# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U55 processor.
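#
# A minimal usage sketch (illustrative only; it assumes earlier compiler passes have already
# produced the network nng, a scheduled subgraph sg and an ArchitectureFeatures instance arch):
#
#     from .register_command_stream_generator import generate_register_command_stream
#
#     generate_register_command_stream(nng, sg, arch, verbose=False)
#     command_words = sg.register_command_stream  # flat list of 32-bit command words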
from collections import defaultdict
from enum import Enum
from enum import IntEnum

import numpy as np

from . import scaling
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Kernel
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .data_type import BaseType
from .data_type import DataType
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import ifm_precision
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .numeric_util import clamp_sigmoid
from .numeric_util import clamp_tanh
from .numeric_util import full_shape
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .shared_buffer_allocation import SharedBufferAllocation
from .tensor import MemType
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose


class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory 2 memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


class CommandStreamEmitter:
    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * 4
        return sz

    def to_list(self):
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

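    # Note: each command word emitted below is 32 bits, with the command code (including the
    # CmdMode payload-mode bits) in bits [15:0] and a 16-bit parameter in bits [31:16].
    # cmd1 commands are followed by one additional 32-bit payload word holding the offset.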
    def cmd0_with_param(self, cmd, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))

    def cmd1_with_offset(self, cmd, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))

    def cmd_wait(self, cmd, param, absolute_wait_time):
        if absolute_wait_time <= self.last_absolute_wait[cmd]:
            return

        self.last_absolute_wait[cmd] = absolute_wait_time
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))

    def cmd_do_operation(self, cmd, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.get_reg_machine(cmd).switch_bank()


def calc_command_dependencies(cmd_stream, arch):
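    # Returns a dict mapping each command to (absolute start position, per-command-type relative
    # wait offsets); a relative offset of None means there is no dependency of that kind.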
    cmd_starts = {}
    cmd_ends = {}
    memory_accesses = {}

    # Keep track of accumulated number of commands in command stream.
    # First element kernel ops: (# of blocks, # of commands)
    # Second element DMA ops: (# of commands)
    pos = np.array((np.array((0, 0)), np.array([0])), dtype=object)

    dependencies = {}

    for cmd in cmd_stream:
        cmd_starts[cmd] = pos
        op_count = cmd.get_operation_count()
        # Keep track of both num blocks and commands
        cmd_add = 0 if (op_count[0] == 0) else 1
        pos = np.array((pos[0] + np.array((op_count[0], cmd_add)), pos[1] + np.array([op_count[1]])), dtype=object)
        cmd_ends[cmd] = np.array((pos[0], pos[1]), dtype=object)
        memory_accesses[cmd] = cmd.get_memory_accesses()

    for idx, cmd in enumerate(cmd_stream):
        curr_accesses = memory_accesses[cmd]
        # Keep track of command dependency.
        # First element kernel ops: (# of blocks, # of commands)
        # Second element DMA ops: (# of commands)
        dep_offsets = np.array((np.array((-1, -1)), np.array([-1])), dtype=object)
        dep_cmds = [None] * CommandType.Size.value
        if idx > 0:
            # Look at the previous commands in backwards order
            for prev_cmd in cmd_stream[idx - 1 :: -1]:
                assert prev_cmd is not cmd
                if dep_cmds[prev_cmd.cmdtype] is None:
                    is_dependency = False
                    if cmd.cmdtype == CommandType.NpuStripe and prev_cmd.cmdtype == CommandType.NpuStripe:
                        # Special handling here, as dpu -> dpu operations require additional care
                        if not SharedBufferAllocation.is_compatible(prev_cmd.ps.shared_buffer, cmd.ps.shared_buffer):
                            is_dependency = True
                        elif memory_accesses[prev_cmd].conflicts(curr_accesses):
                            is_dependency = True
                    else:
                        if memory_accesses[prev_cmd].conflicts(curr_accesses) or (
                            prev_cmd.cmdtype == CommandType.DMA and prev_cmd.in_tensor.purpose == TensorPurpose.LUT
                        ):
                            is_dependency = True

                    if is_dependency:
                        new_offset = cmd_ends[prev_cmd][prev_cmd.cmdtype]
                        if new_offset[0] > dep_offsets[prev_cmd.cmdtype][0]:
                            dep_cmds[prev_cmd.cmdtype] = prev_cmd
                            dep_offsets[prev_cmd.cmdtype] = new_offset

                # Check if we've got dependencies for all commands, in which case we can early out
                for dep in dep_cmds:
                    if dep is None:
                        break
                else:
                    break  # all handled

        # Convert absolute to relative dependencies, using None to signal the special case of no
        # dependency of this kind
        res = [None] * CommandType.Size.value
        for i in range(CommandType.Size.value):
            if dep_cmds[i] is not None:
                res[i] = cmd_starts[cmd][i] - dep_offsets[i]

        dependencies[cmd] = cmd_starts[cmd], res

    return dependencies


def get_op_kernel(ps):
    if ps.primary_op is None:
        return None

    strides = ps.primary_op.attrs.get("strides", (1, 1, 1, 1))
    dilation = ps.primary_op.attrs.get("dilation", (1, 1, 1, 1))
    if ps.weight_tensor:
        if ps.npu_block_type in set((NpuBlockType.VectorProduct, NpuBlockType.ElementWise)):
            k_h = 1
            k_w = 1
        else:
            k_h = ps.weight_tensor.shape[0]
            k_w = ps.weight_tensor.shape[1]
    else:
        k_h = ps.primary_op.attrs.get("filter_height", 1)
        k_w = ps.primary_op.attrs.get("filter_width", 1)

    return Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])


def has_prev_op_dependency(prev_cmd, cmd):
    if prev_cmd is None:
        return False
    if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
        if prev_cmd.ofm_tensor.equivalence_id == cmd.ifm_tensor.equivalence_id:
            return True
        elif cmd.ifm2_tensor is not None:
            return prev_cmd.ofm_tensor.equivalence_id == cmd.ifm2_tensor.equivalence_id
    return False


def get_op_ofm_rect(cmd):
    start = full_shape(4, cmd.ofm_box.start_coord, 0)
    end = full_shape(4, cmd.ofm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifm_rect(cmd):
    start = full_shape(4, cmd.ifm_box.start_coord, 0)
    end = full_shape(4, cmd.ifm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifmofm_block_depth(arch, cmd):
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if cmd.ps.npu_block_type in (
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ElementWise,
        NpuBlockType.ReduceSum,
    ):
        return cmd.ofm_box.get_size_shape()[-1]

    return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)


def get_op_padding_lt(cmd):
    if cmd.ps.npu_block_type not in (
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ReduceSum,
    ):
        return (0, 0)

    explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

    # Check if this is for horizontal ifm streaming
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        explicit_padding[0] = cmd.pad_top
        explicit_padding[2] = cmd.pad_bottom

    return (explicit_padding[1], explicit_padding[0])


def ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
    if ifm_shape == []:
        # Scalar needs to be in IFM2
        return False
    elif ifm2_shape == []:
        return True

    for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
        if ifm != ifm2 and ifm == 1:
            # Broadcasted FM needs to be in IFM2
            return False

    return True

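# Illustrative example of the ordering rule above: ifm_ifm2_correct_order([1, 4, 4, 8], [1, 1, 1, 8])
# returns True (the broadcast feature map is already IFM2), while swapping the two arguments returns
# False, which makes the elementwise code below exchange the IFM/IFM2 tensors and boxes.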

def generate_register_command_stream(nng, sg, arch, verbose=False):
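    # Walks the scheduled high-level command stream of subgraph sg, emits the register settings
    # and NPU operation commands for each high-level command, and stores the resulting 32-bit
    # words in sg.register_command_stream.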
    emit = CommandStreamEmitter()

    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
        }
    else:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
        }

    # Maps an AccumulatorType enum to the corresponding acc_format value
    acc_format_map = {
        SHRAMElements.Acc16: acc_format.FP_S5_10.value,
        SHRAMElements.Acc32: acc_format.INT_32BIT.value,
        SHRAMElements.Acc40: acc_format.INT_40BIT.value,
    }

    # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
    elementwise_mode_map = {
        "MulAct": elementwise_mode.MUL.value,
        "AddAct": elementwise_mode.ADD.value,
        "SubAct": elementwise_mode.SUB.value,
        "Minimum": elementwise_mode.MIN.value,
        "Maximum": elementwise_mode.MAX.value,
        "LeakyRelu": elementwise_mode.LRELU.value,
        "Abs": elementwise_mode.ABS.value,
        "CLZ": elementwise_mode.CLZ.value,
        "SHR": elementwise_mode.SHR.value,
        "SHL": elementwise_mode.SHL.value,
    }

    cmd_stream = []
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            cmd_stream.append(cmd)

    dependencies = calc_command_dependencies(cmd_stream, arch)

    # Initialise operator dependency state
    prev_ifm_rect = cur_ifm_rect = None
    prev_ifm_block_depth = cur_ifm_block_depth = None
    prev_ofm_rect = cur_ofm_rect = None
    prev_ofm_block = cur_ofm_block = None
    prev_kernel = cur_kernel = None
    prev_cmd = None

    def emit_wait_commands(cmd):
        # The command is fully set up, emit whatever wait commands we need
        absolute_dep, relative_dep = dependencies[cmd]
        if relative_dep[CommandType.NpuStripe] is not None:
            if cmd.cmdtype == CommandType.DMA:
                param = relative_dep[CommandType.NpuStripe][1]
                if param <= 3:
                    emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, param, absolute_dep[CommandType.NpuStripe][1])
            else:
                param = relative_dep[CommandType.NpuStripe][0]
                param = min(param, 0xFFFF)  # Clamp to allowable wait amount

        if relative_dep[CommandType.DMA] is not None:
            # TODO This can be optimized for yoda
            param = 0
            emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])

    if arch.is_yoda_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    for cmd in cmd_stream:
        if cmd.cmdtype == CommandType.DMA:
            start_coord = cmd.box.start_coord

            src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
            dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)

            if cmd.in_tensor.compressed_values is not None:
                stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
                sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
            else:
                sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr

            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
            if cmd.out_tensor.purpose == TensorPurpose.LUT:
                emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, BasePointerIndex.Mem2Mem)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])

            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
            dma_channel = 0
            mode = 0  # From external to external

            emit_wait_commands(cmd)
            emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)

        elif cmd.cmdtype == CommandType.NpuStripe:

            ps = cmd.ps
            primary_op = ps.primary_op
            npu_block_type = ps.npu_block_type
            # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
            use_global_scale = False
            # Specifies type of rounding to be used.
            rounding_mode = rounding.TFL
            if primary_op.type == "ResizeBilinear":
                rounding_mode = rounding.TRUNCATE
            fmf = primary_op.attrs.get("fused_memory_function", None)
            faf = primary_op.attrs.get("fused_activation_function", None)
            fused_quantize = any(op.type == "Quantize" for op in ps.ops)

            # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
            op_to_scale = 0

            # Update state history
            prev_ifm_rect = cur_ifm_rect
            prev_ifm_block_depth = cur_ifm_block_depth
            prev_ofm_rect = cur_ofm_rect
            prev_ofm_block = cur_ofm_block
            prev_kernel = cur_kernel
            cur_kernel = get_op_kernel(ps)

            block_config = ps.block_config
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)

            shared_buffer = ps.shared_buffer

            if npu_block_type == NpuBlockType.ElementWise:
                ifm2_broadcast = (
                    IFM2Broadcast.ReverseOperandOrder if primary_op.attrs.get("reverse_op_order", False) else 0
                )

                if not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape):
                    # The scalar has to be the ifm2 tensor so switch the ifms
                    cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
                    cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box

                    # Set ReverseOperandOrder bit to IFM2_BROADCAST
                    ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder

                # Calculate scales needed for arithmetic elementwise operators
                if primary_op.type in set(("AddAct", "MulAct", "SubAct",)):
                    input_scale = cmd.ifm_tensor.quantization.scale_f32
                    input2_scale = cmd.ifm2_tensor.quantization.scale_f32
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if output_scale is not None and faf in ("Sigmoid", "Tanh"):
                        output_scale = 1 / 0x3000

                    if primary_op.type == "MulAct":
                        if None in (input_scale, input2_scale, output_scale):
                            ofm_scale = 1
                            shift = 0
                        else:
                            ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                    else:  # AddAct/SubAct
                        # Force output scale same as the input scale for
                        # resizebilinear 1x1 that is converted to add
                        if "resizebilinear" in primary_op.attrs:
                            output_scale = input2_scale

                        if None in (input_scale, input2_scale, output_scale):
                            opa_scale = opb_scale = ofm_scale = 1
                            opa_shift = shift = 0
                        elif input_scale == input2_scale:
                            opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale
                            )
                            opa_shift = 0  # Unused for this case
                        else:
                            # Use advanced implementation only when input scales differ
                            bitdepth = cmd.ifm_tensor.dtype.bits
                            (
                                opa_scale,
                                opa_shift,
                                ofm_scale,
                                shift,
                                op_to_scale,
                            ) = scaling.advanced_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale, bitdepth
                            )
                            opb_scale = 0  # Unused for this case
                            if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
                                # If the operand order is reversed we also have to swap which operand is scaled
                                if op_to_scale == scaling.OperandToScale.OPa:
                                    op_to_scale = scaling.OperandToScale.OPb
                                else:
                                    op_to_scale = scaling.OperandToScale.OPa

                        emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                elif primary_op.type in set(("LeakyRelu", "Abs",)):
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if primary_op.type == "LeakyRelu":
                        output_scale *= primary_op.attrs["alpha"]

                    ofm_scale, shift = scaling.quantise_scale(output_scale)
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                else:
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)

                # For elementwise set the required SHRAM to be equal to the total size of SHRAM
                shram_required = arch.shram_total_banks
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)

                # Acc buffers not needed so set AB_START to size of SHRAM
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch.shram_total_banks)

                # Is not a unary operator
                if cmd.ifm2_tensor is not None:
                    if cmd.ifm2_tensor.shape == []:
                        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
                        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
                    else:
                        ifm_box_shape = cmd.ifm_box.get_size_shape()
                        ifm2_box_shape = cmd.ifm2_box.get_size_shape()

                        if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
                            # Broadcast in 'H' dimension
                            assert cmd.ifm2_tensor.shape[1] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

                        if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
                            # Broadcast in 'W' dimension
                            assert cmd.ifm2_tensor.shape[2] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

                        if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
                            # Broadcast in 'C' dimension
                            assert cmd.ifm2_tensor.shape[3] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

                        # Set IFM2_IB_START to the latter half of the IB space
                        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
                        emit.cmd0_with_param(
                            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) / 2 + ifm_ib_start
                        )

                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

            else:
                emit.cmd0_with_param(
                    cmd0.NPU_SET_IFM_IB_END,
                    shared_buffer.bank_locations[SharedBufferArea.IFM]
                    + shared_buffer.banks_required[SharedBufferArea.IFM],
                )
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])

            emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])

            if primary_op.type == "ResizeBilinear":
                # perform nearest neighbor upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST)
            elif primary_op.type == "Conv2DBackpropInputSwitchedBias":
                # perform insert zero upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE)

            if npu_block_type in set(
                (
                    NpuBlockType.ConvolutionMxN,
                    NpuBlockType.ConvolutionDepthWise,
                    NpuBlockType.Pooling,
                    NpuBlockType.ReduceSum,
                )
            ):
                # Set up padding
                explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

                # Check if this is for horizontal ifm streaming
                if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
                    explicit_padding[0] = cmd.pad_top
                    explicit_padding[2] = cmd.pad_bottom

                # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
                # because an activation function needed to be fused.
                if cmd.ifm_box.start_coord[-2] > 0:
                    explicit_padding[1] = 0
                if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
                    explicit_padding[3] = 0
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])

                # set kernel x stride low bit
                stride = primary_op.attrs["strides"][2] - 1 & 1
                # set kernel y stride low bit
                stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
                # set kernel x stride extension bits
                stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
                # set kernel y stride extension bits
                stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9
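                # Worked example (illustrative): a stride of 2 in both x and y gives
                # stride = 0b11 here, i.e. both low bits set and no extension bits.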

                if npu_block_type in set((NpuBlockType.Pooling, NpuBlockType.ReduceSum)):
                    k_height, k_width = primary_op.attrs["ksize"][1:3]
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)

                    valid_padding = sum(explicit_padding) == 0

                    if (
                        primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear", "ReduceSum"))
                        and valid_padding
                    ):
                        # For valid padding vela has to output scaling values
                        if faf == "Sigmoid" or faf == "Tanh":
                            rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                multiplier = max(1, int(4096 * cmd.ifm_tensor.quantization.scale_f32 + 0.5))
                                rescale *= 3 * multiplier

                            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                scale = (1 << shift) * 3 * multiplier
                            else:
                                scale = int(round_away_zero(scale * rescale))
                        elif fused_quantize:
                            # Quantize op requires different scaling
                            ifm_scale_f64 = np.double(cmd.ifm_tensor.quantization.scale_f32)
                            ofm_scale_f64 = np.double(cmd.ofm_tensor.quantization.scale_f32)
                            scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
                        else:
                            # In case avg pool fused with concat or other memory operation, rescaling might be needed.
                            # k_height == k_width == 1 is always true in this case
                            # Normally the scale is maximised, to get maximum precision, which means that
                            # if rescale != 1, the scale needs to consider the number of bits needed for rescaling
                            if None not in (
                                cmd.ofm_tensor.quantization.scale_f32,
                                cmd.ifm_tensor.quantization.scale_f32,
                            ):
                                rescale = cmd.ifm_tensor.quantization.scale_f32 / cmd.ofm_tensor.quantization.scale_f32
                                rescale_bits = 0
                                if k_height == k_width == 1:
                                    if fmf == "ConcatSliceWrite":
                                        rounding_mode = rounding.NATURAL
                                    if rescale > 1:
                                        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                                    elif rescale < 1:
                                        rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
                                scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                                scale = int(round_away_zero(scale * rescale))
                            else:
                                scale = 1
                                shift = 0

                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
                        # Valid-padded average pool should use the global scale from
                        # NPU_SET_OFM_SCALE register, which is set above.
                        use_global_scale = True

                else:  # Convolution
                    assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
                    # Reduced precision quantization and natural rounding used for int16
                    if cmd.ifm_tensor.dtype == DataType.int16:
                        rounding_mode = rounding.NATURAL
                    stride |= (cur_kernel.dilation.y - 1) << 4
                    stride |= (cur_kernel.dilation.x - 1) << 3
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
                    )
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
                    )
                    if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
                        # Part-kernel-first weight ordering
                        assert npu_block_type == NpuBlockType.ConvolutionMxN
                        stride |= 1 << 2

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

            elif npu_block_type in set((NpuBlockType.VectorProduct,)):
                # Vector product is implemented using a 1x1 convolution so need
                # to setup the appropriate padding and kernel info
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)

                # kernel stride reg = 0 means stride(1,1) + depth first weight
                # order + dilation(0,0) + kernel_split_size=8
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
            ):
                # Emit Weight base address commands, only maps the area required for
                # this command's weights from the larger tensor.
                stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
                weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index]
                substreams = len(weight_substream_offsets) - 1  # Offset list must terminate with full stream length

                # Extract weight substream offsets and calculate their lengths
                assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
                weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)

                # Set weights sources for active and present cores
                for core, param in enumerate(
                    [
                        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
                        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
                    ]
                ):
                    if core < substreams:
                        emit.cmd1_with_offset(param[0], weight_addr + weight_substream_offsets[core])
                        emit.cmd1_with_offset(
                            param[1], weight_substream_offsets[core + 1] - weight_substream_offsets[core]
                        )
                    elif core < arch.ncores:
                        emit.cmd1_with_offset(param[0], weight_addr)
                        emit.cmd1_with_offset(param[1], 0)

                weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
                emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)

                # Emit Scale & Bias base address commands, with length matching the amount required by
                # the weight tensors.
                if cmd.scale_tensor is not None:
                    scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index]
                    substreams = len(scale_substream_offsets) - 1  # Offset list must terminate with full stream length

                    # Extract scale substream offsets and calculate their lengths
                    assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
                    scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])

                    # Set scale sources for active and present cores
                    for core, param in enumerate(
                        [
                            (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
                            (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
                        ]
                    ):
                        if core < substreams:
                            emit.cmd1_with_offset(param[0], scale_addr + scale_substream_offsets[core])
                            emit.cmd1_with_offset(
                                param[1], scale_substream_offsets[core + 1] - scale_substream_offsets[core]
                            )
                        elif core < arch.ncores:
                            emit.cmd1_with_offset(param[0], scale_addr)
                            emit.cmd1_with_offset(param[1], 0)

                    # Emit base address for NPU to access scale & bias data
                    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
                    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)

            ofm_quant = cmd.ofm_tensor.quantization
            ofm_quant_qmin = cmd.ofm_tensor.quantization.quant_min
            ofm_quant_qmax = cmd.ofm_tensor.quantization.quant_max
            ifm_min = cmd.ifm_tensor.quantization.min
            ifm_max = cmd.ifm_tensor.quantization.max

            # Emit commands for any fused activation function
            if faf is None:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                # Even if no activation function, values need to be set to override previous values
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            elif faf == "Relu":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = ofm_quant_qmax
            elif faf == "Relu6":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "ReluN1To1":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Tanh":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Sigmoid":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "LUT":
                lut_index = int(activation.LUT_START.value) + primary_op.attrs.get("lut_index", 0)
                assert lut_index <= activation.LUT_END.value, "LUT index out of range."
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, lut_index)
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            else:
                raise Exception("Unsupported fused_activation_function = " + faf)

            # Activation range needs to be set based upon the quantisation range and the fused activation range
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))

            out_shape = cmd.ofm_box.get_size_shape()
            if len(out_shape) >= 4:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
            if len(out_shape) >= 2:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)

            if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum)):
                in_shape = cmd.ifm_box.get_size_shape()
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)

            for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in (
                (
                    cmd.ifm_tensor,
                    cmd.ifm_box,
                    cmd0.NPU_SET_IFM_REGION,
                    (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
                    (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
                    cmd0.NPU_SET_IFM_ZERO_POINT,
                ),
                (
                    cmd.ifm2_tensor,
                    cmd.ifm2_box,
                    cmd0.NPU_SET_IFM2_REGION,
                    (
                        cmd1.NPU_SET_IFM2_BASE0,
                        cmd1.NPU_SET_IFM2_BASE1,
                        cmd1.NPU_SET_IFM2_BASE2,
                        cmd1.NPU_SET_IFM2_BASE3,
                    ),
                    (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
                    cmd0.NPU_SET_IFM2_ZERO_POINT,
                ),
                (
                    cmd.ofm_tensor,
                    cmd.ofm_box,
                    cmd0.NPU_SET_OFM_REGION,
                    (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
                    (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
                    cmd0.NPU_SET_OFM_ZERO_POINT,
                ),
            ):

                if tens is None:
                    continue

                need_zero_point = (faf is not None) or (fmf == "ConcatSliceWrite") or fused_quantize
                if (
                    primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")) and not need_zero_point
                ) or tens.quantization is None:
                    # Actual integer operation, just set scale to 1 and zero point to 0
                    emit.cmd0_with_param(zero_point_op, 0)
                else:
                    assert tens.quantization.zero_point is not None, "need an actual zero point set"
                    if (
                        "resizebilinear" in primary_op.attrs
                        and primary_op.type == "AddAct"
                        and cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op
                    ):
                        # Force output zero point same as the input zero point
                        # for resizebilinear 1x1 that is converted to add
                        zero_point = cmd.ifm2_tensor.quantization.zero_point
                    else:
                        zero_point = tens.quantization.zero_point
                    emit.cmd0_with_param(zero_point_op, int(zero_point))

                if tens.shape == []:
                    # Empty shape, elementwise constant
                    ifm2_scalar = tens.quant_values
                    assert ifm2_scalar.size == 1
                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0)))
                    continue

                height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
                    box.start_coord, box.end_coord
                )
                if npu_block_type != NpuBlockType.VectorProduct:
                    if tens == cmd.ifm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
                    elif tens == cmd.ofm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
                    if tens == cmd.ifm2_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
                else:
                    if len(out_shape) == 2:
                        # TODO: N is put in W-dimension for now
                        # Should be spread over H and W, but then block size selection,
                        # and stride calculation should be changed
                        if tens == cmd.ifm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, out_shape[-2] - 1)
                        elif tens == cmd.ofm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, out_shape[-2] - 1)
                    else:
                        assert False

                emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])

                for idx, addr in enumerate(addresses):
                    if addr is None:
                        addresses[idx] = 0

                emit.cmd1_with_offset(ptr_ops[0], addresses[0])
                emit.cmd1_with_offset(ptr_ops[1], addresses[1])
                emit.cmd1_with_offset(ptr_ops[2], addresses[2])
                emit.cmd1_with_offset(ptr_ops[3], addresses[3])

                strides = tens.get_strides()
                emit.cmd1_with_offset(stride_ops[0], strides[1])  # stride between 16-byte channel blocks (C)
                emit.cmd1_with_offset(stride_ops[2], strides[3])  # stride between horizontal values (W)
                emit.cmd1_with_offset(stride_ops[1], strides[2])  # stride between vertical values (H)

                if tens.format == TensorFormat.NHCWB16:
                    # Check that all BasePointer addresses are aligned to 16 bytes
                    assert (int(addresses[0]) % 16) == 0
                    assert (int(addresses[1]) % 16) == 0
                    assert (int(addresses[2]) % 16) == 0
                    assert (int(addresses[3]) % 16) == 0

            ofm_dtype = cmd.ofm_tensor.dtype
            assert ofm_dtype.type & BaseType.Int
            prec = 0
            if ofm_dtype.size_in_bits() == 8:
                prec = 0
            elif ofm_dtype.size_in_bits() == 16:
                prec = 2
            elif ofm_dtype.size_in_bits() == 32:
                prec = 4
            else:
                assert 0

            if ofm_dtype.type & BaseType.Signed:
                prec += 1

            if use_global_scale:
                # Set global scale bit, as opposed to using per channel scale
                prec |= 1 << 8

            if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
                prec |= 1 << 6

            prec |= rounding_mode.value << 14

            emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

            prec = None
            weight_bits = 8
            if cmd.weight_tensor is not None:
                weight_bits = cmd.weight_tensor.dtype.size_in_bits()

            ifm_dtype = cmd.ifm_tensor.dtype

            assert weight_bits == 8, "Unsupported weight bit depth"
            assert (
                ifm_dtype.size_in_bits() in {8, 16}
                or ifm_dtype.size_in_bits() == 32
                and npu_block_type in (NpuBlockType.ElementWise, NpuBlockType.ReduceSum)
            ), "Unsupported ifm bit depth"

            if ifm_dtype.size_in_bits() == 8:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S8
                else:
                    prec = ifm_precision.U8
            elif ifm_dtype.size_in_bits() == 16:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S16
                else:
                    prec = ifm_precision.U16
            elif ifm_dtype == DataType.int32:
                prec = ifm_precision.S32

            ifm_prec = prec.value
            ifm2_prec = ifm_prec

            if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
                ifm_prec |= 1 << 6

            ifm_prec |= op_to_scale << 8

            emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)

            if cmd.ifm2_tensor is not None:
                if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
                    ifm2_prec |= 1 << 6
                emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)

            emit_wait_commands(cmd)

            # Get op parameters
            cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
            cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
            cur_ofm_rect = get_op_ofm_rect(cmd)
            cur_ifm_rect = get_op_ifm_rect(cmd)
            cur_padLT = get_op_padding_lt(cmd)
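            # Determine the block dependency (roughly, how far this op may overlap with the
            # previous op); MAX_BLOCKDEP is used when the two ops are independent.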
            if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
                if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
                    blockdep = arch.calc_block_dep(
                        prev_ifm_rect,
                        prev_ofm_rect,
                        prev_ifm_block_depth,
                        prev_ofm_block,
                        prev_kernel,
                        cur_ifm_rect,
                        cur_ofm_rect,
                        cur_ifm_block_depth,
                        cur_ofm_block,
                        cur_kernel,
                        cur_padLT,
                    )
                else:
                    blockdep = 0
            else:
                blockdep = ArchitectureFeatures.MAX_BLOCKDEP

            # Set between every op (dependent or not)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_cmd = cmd

            if npu_block_type == NpuBlockType.ConvolutionMxN:
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
                emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
            elif npu_block_type == NpuBlockType.VectorProduct:
                # Vector product is implemented using a 1x1 convolution
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.Pooling:
                param = pooling_mode.MAX.value if "Max" in primary_op.type else pooling_mode.AVERAGE.value
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
            elif npu_block_type == NpuBlockType.ReduceSum:
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_mode.REDUCE_SUM.value)
            elif npu_block_type == NpuBlockType.ElementWise:
                param = elementwise_mode_map[primary_op.type]
                emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
            else:
                print("Warning: Skipping register command stream generation for", ps)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)

    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))