# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register-level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream, generates
# all the register settings, calculates dependencies between commands, inserts wait operations, and produces a bit
# stream suitable for interpretation by the Ethos-U55 processor.
from collections import defaultdict
from enum import Enum
from enum import IntEnum

import numpy as np

from . import scaling
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Kernel
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .data_type import BaseType
from .data_type import DataType
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import ifm_precision
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .numeric_util import clamp_sigmoid
from .numeric_util import clamp_tanh
from .numeric_util import full_shape
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .shared_buffer_allocation import SharedBufferAllocation
from .tensor import MemType
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat


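# Caches the last value written to each register so that redundant register writes can be skipped:
# set_register() returns True only when the new (command, value) pair differs from the cached one.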
class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


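# Builds the register command stream as a list of tuples of 32-bit words. cmd0-type commands are a
# single word with the opcode in the low 16 bits and the parameter in the high 16 bits; cmd1-type
# commands carry an additional 32-bit payload word. Redundant register writes are dropped via the
# two RegisterMachines (index 1 for DMA commands, index 0 for everything else) and waits that are
# already satisfied are dropped via last_absolute_wait.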
class CommandStreamEmitter:
    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * 4
        return sz

    def to_list(self):
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))

    def cmd1_with_offset(self, cmd, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))

    def cmd_wait(self, cmd, param, absolute_wait_time):
        if absolute_wait_time <= self.last_absolute_wait[cmd]:
            return

        self.last_absolute_wait[cmd] = absolute_wait_time
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))

    def cmd_do_operation(self, cmd, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.get_reg_machine(cmd).switch_bank()


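# For every command in the stream, find the closest preceding command of each type (kernel op or
# DMA) that it has to wait for: either their memory accesses conflict, or an NPU stripe follows a
# stripe whose shared-buffer allocation is incompatible with its own. The result maps each command
# to its absolute start position plus the relative distances that emit_wait_commands() later turns
# into NPU_OP_KERNEL_WAIT / NPU_OP_DMA_WAIT commands.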
def calc_command_dependencies(cmd_stream, arch):
    cmd_starts = {}
    cmd_ends = {}
    memory_accesses = {}

    # Keep track of accumulated number of commands in command stream.
    # First element kernel ops: (# of blocks, # of commands)
    # Second element DMA ops: (# of commands)
    pos = np.array((np.array((0, 0)), np.array([0])))

    dependencies = {}

    for cmd in cmd_stream:
        cmd_starts[cmd] = pos
        op_count = cmd.get_operation_count()
        # Keep track of both num blocks and commands
        cmd_add = 0 if (op_count[0] == 0) else 1
        pos = np.array((pos[0] + np.array((op_count[0], cmd_add)), pos[1] + np.array([op_count[1]])))
        cmd_ends[cmd] = np.array((pos[0], pos[1]))
        memory_accesses[cmd] = cmd.get_memory_accesses()

    for idx, cmd in enumerate(cmd_stream):
        curr_accesses = memory_accesses[cmd]
        # Keep track of command dependency.
        # First element kernel ops: (# of blocks, # of commands)
        # Second element DMA ops: (# of commands)
        dep_offsets = np.array((np.array((-1, -1)), np.array([-1])))
        dep_cmds = [None] * CommandType.Size.value
        if idx > 0:
            # Look at the previous commands in backwards order
            for prev_cmd in cmd_stream[idx - 1 :: -1]:
                assert prev_cmd is not cmd
                if dep_cmds[prev_cmd.cmdtype] is None:
                    is_dependency = False
                    if cmd.cmdtype == CommandType.NpuStripe and prev_cmd.cmdtype == CommandType.NpuStripe:
                        # Special handling here, as dpu -> dpu operations require additional care
                        if not SharedBufferAllocation.is_compatible(prev_cmd.ps.shared_buffer, cmd.ps.shared_buffer):
                            is_dependency = True
                        elif memory_accesses[prev_cmd].conflicts(curr_accesses):
                            is_dependency = True
                    else:
                        if memory_accesses[prev_cmd].conflicts(curr_accesses):
                            is_dependency = True

                    if is_dependency:
                        new_offset = cmd_ends[prev_cmd][prev_cmd.cmdtype]
                        if new_offset[0] > dep_offsets[prev_cmd.cmdtype][0]:
                            dep_cmds[prev_cmd.cmdtype] = prev_cmd
                            dep_offsets[prev_cmd.cmdtype] = new_offset

                        # Check if we've got dependencies for all commands, in which case we can early out
                        for dep in dep_cmds:
                            if dep is None:
                                break
                        else:
                            break  # all handled

        # Convert absolute to relative dependencies, using None to signal the special case of no
        # dependency of this kind
        res = [None] * CommandType.Size.value
        for i in range(CommandType.Size.value):
            if dep_cmds[i] is not None:
                res[i] = cmd_starts[cmd][i] - dep_offsets[i]

        dependencies[cmd] = cmd_starts[cmd], res

    return dependencies


def get_op_kernel(ps):
    if ps.primary_op is None:
        return None

    strides = ps.primary_op.attrs.get("strides", (1, 1, 1, 1))
    dilation = ps.primary_op.attrs.get("dilation", (1, 1, 1, 1))
    if ps.weight_tensor:
        if ps.npu_block_type in set((NpuBlockType.VectorProduct, NpuBlockType.ElementWise)):
            k_h = 1
            k_w = 1
        else:
            k_h = ps.weight_tensor.shape[0]
            k_w = ps.weight_tensor.shape[1]
    else:
        k_h = ps.primary_op.attrs.get("filter_height", 1)
        k_w = ps.primary_op.attrs.get("filter_width", 1)

    return Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])


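# A stripe depends on the previous stripe when the previous stripe's OFM is, by equivalence id, the
# IFM or IFM2 of the current one. This decides whether the block-level dependency (BLOCKDEP)
# calculation towards the end of generate_register_command_stream() is applicable.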
def has_prev_op_dependency(prev_cmd, cmd):
    if prev_cmd is None:
        return False
    if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
        if prev_cmd.ofm_tensor.equivalence_id == cmd.ifm_tensor.equivalence_id:
            return True
        elif cmd.ifm2_tensor is not None:
            return prev_cmd.ofm_tensor.equivalence_id == cmd.ifm2_tensor.equivalence_id
    return False


def get_op_ofm_rect(cmd):
    start = full_shape(4, cmd.ofm_box.start_coord, 0)
    end = full_shape(4, cmd.ofm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifm_rect(cmd):
    start = full_shape(4, cmd.ifm_box.start_coord, 0)
    end = full_shape(4, cmd.ifm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifmofm_block_depth(arch, cmd):
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if cmd.ps.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling, NpuBlockType.ElementWise):
        return cmd.ofm_box.get_size_shape()[-1]

    return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)


def get_op_padding_lt(cmd):
    if cmd.ps.npu_block_type not in (
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ConvolutionMxN,
    ):
        return (0, 0)

    explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

    # Check if this is for horizontal ifm streaming
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        explicit_padding[0] = cmd.pad_top
        explicit_padding[2] = cmd.pad_bottom

    return (explicit_padding[1], explicit_padding[0])


def generate_register_command_stream(nng, sg, arch, verbose=False):
    emit = CommandStreamEmitter()

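    # Choose how each memory type maps onto the NPU base pointer (region) indices: when feature
    # maps live in the same memory area as the fast storage, Scratch and Scratch_fast share the
    # scratch base pointer, otherwise Scratch_fast gets its own index.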
    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
        }
    else:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
        }

    # Maps an AccumulatorType enum to the corresponding acc_format value
    acc_format_map = {
        SHRAMElements.Acc16: acc_format.FP_S5_10.value,
        SHRAMElements.Acc32: acc_format.INT_32BIT.value,
        SHRAMElements.Acc40: acc_format.INT_40BIT.value,
    }

    # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
    elementwise_mode_map = {
        "MulAct": elementwise_mode.MUL.value,
        "AddAct": elementwise_mode.ADD.value,
        "SubAct": elementwise_mode.SUB.value,
        "Minimum": elementwise_mode.MIN.value,
        "Maximum": elementwise_mode.MAX.value,
        "LeakyRelu": elementwise_mode.LRELU.value,
        "Abs": elementwise_mode.ABS.value,
    }

    cmd_stream = []
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            cmd_stream.append(cmd)

    dependencies = calc_command_dependencies(cmd_stream, arch)

    # Initialise operator dependency state
    prev_ifm_rect = cur_ifm_rect = None
    prev_ifm_block_depth = cur_ifm_block_depth = None
    prev_ofm_rect = cur_ofm_rect = None
    prev_ofm_block = cur_ofm_block = None
    prev_kernel = cur_kernel = None
    prev_cmd = None

    def emit_wait_commands(cmd):
        # The command is fully set up, emit whatever wait commands we need
        absolute_dep, relative_dep = dependencies[cmd]
        if relative_dep[CommandType.NpuStripe] is not None:
            if cmd.cmdtype == CommandType.DMA:
                param = relative_dep[CommandType.NpuStripe][1]
                if param <= 3:
                    emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, param, absolute_dep[CommandType.NpuStripe][1])
            else:
                param = relative_dep[CommandType.NpuStripe][0]
                param = min(param, 0xFFFF)  # Clamp to allowable wait amount

        if relative_dep[CommandType.DMA] is not None:
            # TODO This can be optimized for yoda
            param = 0
            emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])

    for cmd in cmd_stream:
        if cmd.cmdtype == CommandType.DMA:
            start_coord = cmd.box.start_coord

            src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
            dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)

            if cmd.in_tensor.compressed_values is not None:
                stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
                sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
            else:
                sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr

            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])

            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
            dma_channel = 0
            mode = 0  # From external to external

            emit_wait_commands(cmd)
            emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)

        elif cmd.cmdtype == CommandType.NpuStripe:

            ps = cmd.ps
            primary_op = ps.primary_op
            npu_block_type = ps.npu_block_type
            # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
            use_global_scale = False
            # Specifies type of rounding to be used.
            rounding_mode = rounding.TFL
            if primary_op.type == "ResizeBilinear":
                rounding_mode = rounding.TRUNCATE
            fmf = primary_op.attrs.get("fused_memory_function", None)
            faf = primary_op.attrs.get("fused_activation_function", None)

            # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
            op_to_scale = 0

            # Update state history
            prev_ifm_rect = cur_ifm_rect
            prev_ifm_block_depth = cur_ifm_block_depth
            prev_ofm_rect = cur_ofm_rect
            prev_ofm_block = cur_ofm_block
            prev_kernel = cur_kernel
            cur_kernel = get_op_kernel(ps)

            block_config = ps.block_config
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)

            shared_buffer = ps.shared_buffer

            if npu_block_type == NpuBlockType.ElementWise:
                ifm2_broadcast = 0

                if cmd.ifm_tensor.shape == []:
                    # The scalar has to be the ifm2 tensor so switch the ifms
                    cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
                    cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box

                    # Set ReverseOperandOrder bit to IFM2_BROADCAST
                    ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder

                # Calculate scales needed for arithmetic elementwise operators
                if primary_op.type in set(("AddAct", "MulAct", "SubAct",)):
                    input_scale = cmd.ifm_tensor.quantization.scale_f32
                    input2_scale = cmd.ifm2_tensor.quantization.scale_f32
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if primary_op.type == "MulAct":
                        if (faf == "Sigmoid") or (faf == "Tanh"):
                            output_scale = 1 / 0x3000

                        ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                    else:  # AddAct/SubAct
                        if (faf == "Sigmoid") or (faf == "Tanh"):
                            output_scale = 1 / 0x3000

                        if input_scale == input2_scale:
                            opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale
                            )
                            opa_shift = 0  # Unused for this case
                        else:
                            # Use advanced implementation only when input scales differ
                            bitdepth = cmd.ifm_tensor.dtype.bits
                            (
                                opa_scale,
                                opa_shift,
                                ofm_scale,
                                shift,
                                op_to_scale,
                            ) = scaling.advanced_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale, bitdepth
                            )
                            opb_scale = 0  # Unused for this case
                            if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
                                # If the operand order is reversed we also have to swap which operand is scaled
                                if op_to_scale == scaling.OperandToScale.OPa:
                                    op_to_scale = scaling.OperandToScale.OPb
                                else:
                                    op_to_scale = scaling.OperandToScale.OPa

                        emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                if primary_op.type in set(("LeakyRelu", "Abs",)):
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if primary_op.type == "LeakyRelu":
                        output_scale *= primary_op.attrs["alpha"]

                    ofm_scale, shift = scaling.quantise_scale(output_scale)
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                # For elementwise set the required SHRAM to be equal to the total size of SHRAM
                shram_required = arch.shram_total_banks
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)

                # Acc buffers not needed so set AB_START to size of SHRAM
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch.shram_total_banks)

                # Is not a unary operator
                if cmd.ifm2_tensor is not None:
                    if cmd.ifm2_tensor.shape == []:
                        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
                        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
                    else:
                        ifm_box_shape = cmd.ifm_box.get_size_shape()
                        ifm2_box_shape = cmd.ifm2_box.get_size_shape()

                        if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
                            # Broadcast in 'H' dimension
                            assert cmd.ifm2_tensor.shape[1] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

                        if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
                            # Broadcast in 'W' dimension
                            assert cmd.ifm2_tensor.shape[2] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

                        if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
                            # Broadcast in 'C' dimension
                            assert cmd.ifm2_tensor.shape[3] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

                    # Set IFM2_IB_START to the latter half of the IB space
                    ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) / 2 + ifm_ib_start
                    )

                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

            else:
                emit.cmd0_with_param(
                    cmd0.NPU_SET_IFM_IB_END,
                    shared_buffer.bank_locations[SharedBufferArea.IFM]
                    + shared_buffer.banks_required[SharedBufferArea.IFM],
                )
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])

            emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])

            if primary_op.type == "ResizeBilinear":
                # perform nearest neighbor upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST)
            elif primary_op.type == "Conv2DBackpropInputSwitchedBias":
                # perform insert zero upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling)
            ):
                # Set up padding
                explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

                # Check if this is for horizontal ifm streaming
                if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
                    explicit_padding[0] = cmd.pad_top
                    explicit_padding[2] = cmd.pad_bottom

                # Indexing from the end since a 1x1 AvgPool might have been added with non 4-dimensional
                # input/output because an activation function needed to be fused.
                if cmd.ifm_box.start_coord[-2] > 0:
                    explicit_padding[1] = 0
                if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
                    explicit_padding[3] = 0
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])

                # set kernel x stride low bit
                stride = primary_op.attrs["strides"][2] - 1 & 1
                # set kernel y stride low bit
                stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
                # set kernel x stride extension bits
                stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
                # set kernel y stride extension bits
                stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9

                if npu_block_type == NpuBlockType.Pooling:
                    k_height, k_width = primary_op.attrs["ksize"][1:3]
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)

                    valid_padding = sum(explicit_padding) == 0

                    if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")) and valid_padding:
                        # For valid padding vela has to output scaling values
                        if faf == "Sigmoid" or faf == "Tanh":
                            rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                multiplier = max(1, int(4096 * cmd.ifm_tensor.quantization.scale_f32 + 0.5))
                                rescale *= 3 * multiplier

                            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                scale = (1 << shift) * 3 * multiplier
                            else:
                                scale = int(round_away_zero(scale * rescale))
                        else:
                            # In case avg pool fused with concat or other memory operation, rescaling might be needed.
                            # k_height == k_width == 1 is always true in this case
                            # Normally the scale is maximised, to get maximum precision, which means that
                            # if rescale != 1, the scale needs to consider the number of bits needed for rescaling
                            rescale = cmd.ifm_tensor.quantization.scale_f32 / cmd.ofm_tensor.quantization.scale_f32
                            rescale_bits = 0
                            if k_height == k_width == 1:
                                if fmf == "ConcatSliceWrite":
                                    rounding_mode = rounding.NATURAL
                                if rescale > 1:
                                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                                elif rescale < 1:
                                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                            scale = int(round_away_zero(scale * rescale))

                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
                        # Valid-padded average pool should use the global scale from
                        # NPU_SET_OFM_SCALE register, which is set above.
                        use_global_scale = True

                else:  # Convolution
                    assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
                    # Reduced precision quantization and natural rounding used for int16
                    if cmd.ifm_tensor.dtype == DataType.int16:
                        rounding_mode = rounding.NATURAL
                    stride |= (cur_kernel.dilation.y - 1) << 4
                    stride |= (cur_kernel.dilation.x - 1) << 3
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
                    )
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
                    )
                    if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
                        # Part-kernel-first weight ordering
                        assert npu_block_type == NpuBlockType.ConvolutionMxN
                        stride |= 1 << 2

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

            elif npu_block_type in set((NpuBlockType.VectorProduct,)):
                # Vector product is implemented using a 1x1 convolution so need
                # to setup the appropriate padding and kernel info
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)

                # kernel stride reg = 0 means stride(1,1) + depth first weight
                # order + dilation(0,0) + kernel_split_size=8
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
            ):
                # Emit Weight base address commands, only maps the area required for
                # this command's weights from the larger tensor.
                stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
                weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)
                weight_len = cmd.weight_tensor.size_of_compressed_stream(stream_index)
                weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
                emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr)
                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_len)

                # Emit Scale & Bias base address commands, with length matching the amount required by
                # the weight tensors.
                if cmd.scale_tensor is not None:
                    # Get address and size of the scale/bias data area
                    scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])
                    scale_len = (
                        cmd.scale_tensor.address_for_coordinate(cmd.weight_box.end_coord[-1:], True) - scale_addr
                    )
                    # Emit base address for NPU to access scale & bias data
                    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
                    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr)
                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, round_up(scale_len, 16))

            ofm_quant = cmd.ofm_tensor.quantization
            ofm_quant_qmin = cmd.ofm_tensor.quantization.quant_min
            ofm_quant_qmax = cmd.ofm_tensor.quantization.quant_max
            ifm_min = cmd.ifm_tensor.quantization.min
            ifm_max = cmd.ifm_tensor.quantization.max

            # Emit commands for any fused activation function
            if faf is None:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                # Even if no activation function, values need to be set to override previous values
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            elif faf == "Relu":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = ofm_quant_qmax
            elif faf == "Relu6":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "ReluN1To1":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Tanh":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Sigmoid":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            else:
                raise Exception("Unsupported fused_activation_function = " + faf)

            # Activation range needs to be set based upon the quantisation range and the fused activation range
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))

            out_shape = cmd.ofm_box.get_size_shape()
            if len(out_shape) >= 4:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
            if len(out_shape) >= 2:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)

            if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)):
                in_shape = cmd.ifm_box.get_size_shape()
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)

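            # Program region index, rolling-buffer geometry, the four base addresses, the strides
            # and the zero point for each of the three feature maps (IFM, IFM2 and OFM); entries
            # whose tensor is None (e.g. IFM2 for non-elementwise operations) are skipped.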
            for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in (
                (
                    cmd.ifm_tensor,
                    cmd.ifm_box,
                    cmd0.NPU_SET_IFM_REGION,
                    (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
                    (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
                    cmd0.NPU_SET_IFM_ZERO_POINT,
                ),
                (
                    cmd.ifm2_tensor,
                    cmd.ifm2_box,
                    cmd0.NPU_SET_IFM2_REGION,
                    (
                        cmd1.NPU_SET_IFM2_BASE0,
                        cmd1.NPU_SET_IFM2_BASE1,
                        cmd1.NPU_SET_IFM2_BASE2,
                        cmd1.NPU_SET_IFM2_BASE3,
                    ),
                    (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
                    cmd0.NPU_SET_IFM2_ZERO_POINT,
                ),
                (
                    cmd.ofm_tensor,
                    cmd.ofm_box,
                    cmd0.NPU_SET_OFM_REGION,
                    (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
                    (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
                    cmd0.NPU_SET_OFM_ZERO_POINT,
                ),
            ):

                if tens is None:
                    continue

                need_zero_point = (faf is not None) or (fmf == "ConcatSliceWrite")
                if (
                    primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")) and not need_zero_point
                ) or tens.quantization is None:
                    # Actual integer operation, just set scale to 1 and zero point to 0
                    emit.cmd0_with_param(zero_point_op, 0)
                else:
                    assert tens.quantization.zero_point is not None, "need an actual zero point set"
                    emit.cmd0_with_param(zero_point_op, int(tens.quantization.zero_point))

                if tens.shape == []:
                    # Empty shape, elementwise constant
                    ifm2_scalar = tens.quant_values
                    assert ifm2_scalar.size == 1
                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0)))
                    continue

                height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
                    box.start_coord, box.end_coord
                )
                if npu_block_type != NpuBlockType.VectorProduct:
                    if tens == cmd.ifm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
                    elif tens == cmd.ofm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
                    if tens == cmd.ifm2_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
                else:
                    if len(out_shape) == 2:
                        # TODO: N is put in W-dimension for now
                        # Should be spread over H and W, but then block size selection
                        # and stride calculation would need to change
                        if tens == cmd.ifm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, out_shape[-2] - 1)
                        elif tens == cmd.ofm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, out_shape[-2] - 1)
                    else:
                        assert False

                emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])

                for idx, addr in enumerate(addresses):
                    if addr is None:
                        addresses[idx] = 0

                emit.cmd1_with_offset(ptr_ops[0], addresses[0])
                emit.cmd1_with_offset(ptr_ops[1], addresses[1])
                emit.cmd1_with_offset(ptr_ops[2], addresses[2])
                emit.cmd1_with_offset(ptr_ops[3], addresses[3])

                strides = tens.get_strides()
                emit.cmd1_with_offset(stride_ops[0], strides[1])  # stride between 16-byte channel blocks (C)
                emit.cmd1_with_offset(stride_ops[2], strides[3])  # stride between horizontal values (W)
                emit.cmd1_with_offset(stride_ops[1], strides[2])  # stride between vertical values (H)

                if tens.format == TensorFormat.NHCWB16:
                    # Check that all BasePointer addresses are aligned to 16 bytes
                    assert (int(addresses[0]) % 16) == 0
                    assert (int(addresses[1]) % 16) == 0
                    assert (int(addresses[2]) % 16) == 0
                    assert (int(addresses[3]) % 16) == 0

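            # Build the OFM_PRECISION register value: bits 0-1 encode the output data type
            # (0/1 = uint8/int8, 2/3 = uint16/int16), bit 6 selects the NHCWB16 layout, bit 8
            # selects the global OFM scale instead of per-channel scales, and the rounding mode
            # goes in the bits from 14 upwards.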
            ofm_dtype = cmd.ofm_tensor.dtype
            assert ofm_dtype.type & BaseType.Int
            prec = 0
            if ofm_dtype.size_in_bits() == 8:
                prec = 0
            elif ofm_dtype.size_in_bits() == 16:
                prec = 2
            else:
                assert 0

            if ofm_dtype.type & BaseType.Signed:
                prec += 1

            if use_global_scale:
                # Set global scale bit, as opposed to using per channel scale
                prec |= 1 << 8

            if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
                prec |= 1 << 6

            prec |= rounding_mode.value << 14

            emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

            prec = None
            weight_bits = 8
            if cmd.weight_tensor is not None:
                weight_bits = cmd.weight_tensor.dtype.size_in_bits()

            ifm_dtype = cmd.ifm_tensor.dtype

            assert weight_bits == 8, "Unsupported weight bit depth"
            assert ifm_dtype.size_in_bits() in {8, 16}

            if ifm_dtype.size_in_bits() == 8:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S8
                else:
                    prec = ifm_precision.U8
            elif ifm_dtype.size_in_bits() == 16:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S16
                else:
                    prec = ifm_precision.U16

            ifm_prec = prec.value
            ifm2_prec = ifm_prec

            if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
                ifm_prec |= 1 << 6

            ifm_prec |= op_to_scale << 8

            emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)

            if cmd.ifm2_tensor is not None:
                if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
                    ifm2_prec |= 1 << 6
                emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)

            emit_wait_commands(cmd)

            # Get op parameters
            cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
            cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
            cur_ofm_rect = get_op_ofm_rect(cmd)
            cur_ifm_rect = get_op_ifm_rect(cmd)
            cur_padLT = get_op_padding_lt(cmd)
            if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
                if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
                    blockdep = arch.calc_block_dep(
                        prev_ifm_rect,
                        prev_ofm_rect,
                        prev_ifm_block_depth,
                        prev_ofm_block,
                        prev_kernel,
                        cur_ifm_rect,
                        cur_ofm_rect,
                        cur_ifm_block_depth,
                        cur_ofm_block,
                        cur_kernel,
                        cur_padLT,
                    )
                else:
                    blockdep = 0
            else:
                blockdep = ArchitectureFeatures.MAX_BLOCKDEP

            # Set between every op (dependent or not)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_cmd = cmd

            if npu_block_type == NpuBlockType.ConvolutionMxN:
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
                emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
            elif npu_block_type == NpuBlockType.VectorProduct:
                # Vector product is implemented using a 1x1 convolution
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.Pooling:
                param = "Max" not in primary_op.type
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
            elif npu_block_type == NpuBlockType.ElementWise:
                param = elementwise_mode_map[primary_op.type]
                emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
            else:
                print("Warning: Skipping register command stream generation for", ps)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)

    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))