# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register-level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream, generates
# all the register settings, calculates dependencies between commands and inserts wait operations, and produces a
# bit stream suitable for interpretation by the Ethos-U55 processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum

import numpy as np

from . import scaling
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .data_type import BaseType
from .data_type import DataType
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import ifm_precision
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .numeric_util import clamp_sigmoid
from .numeric_util import clamp_tanh
from .numeric_util import full_shape
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .operation import Op
from .tensor import MemType
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose

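
# RegisterMachine caches the last (command, value) pair written for each
# register so that redundant NPU_SET_* writes can be dropped from the stream.
# The emitter below keeps two of these: DMA register writes are de-duplicated
# independently of the other NPU registers (see CommandStreamEmitter.get_reg_machine).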
class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory 2 memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self):
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

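    # Command encoding (as used by the emitters below): a cmd0 command is a
    # single 32-bit word with the opcode in bits [15:0] and a 16-bit parameter
    # in bits [31:16]; a cmd1 command additionally sets CmdMode.Payload32 in
    # the opcode word and is followed by one 32-bit payload word.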
    def cmd0_with_param(self, cmd, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFF  # payload is a single 32-bit word
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd, channel, outstanding_count):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


Watermark = namedtuple("Watermark", ["npu", "dma"])
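# A Watermark holds one command-stream index per engine (NPU kernels, DMA):
# everything before that index is known to have its dependencies satisfied,
# so the dependency search below never has to look further back than that.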


def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, watermark: Watermark):
    cmd = cmd_stream[cmd_index]
    cmd_access = memory_accesses[cmd]
    index = cmd_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_cmd = cmd_stream[index]
        prev_access = memory_accesses[prev_cmd]

        # Check DMA consuming NPU output
        if prev_cmd.cmdtype == CommandType.NpuStripe:
            if index >= npu_index:
                if (cmd.cmdtype == CommandType.DMA) and (npu_outstanding == -1) and prev_access.conflicts(cmd_access):
                    npu_outstanding = npu_ops
                npu_ops = npu_ops + 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        # Check NPU consuming DMA output
        elif prev_cmd.cmdtype == CommandType.DMA:
            if index >= dma_index:
                if cmd.cmdtype == CommandType.NpuStripe:
                    if (dma_outstanding == -1) and prev_access.conflicts(cmd_access):
                        dma_outstanding = dma_ops
                dma_ops = dma_ops + 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)

        index = index - 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = cmd_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)
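    # Example: a DMA command whose accesses conflict with the NPU stripe
    # immediately before it yields outstanding.npu == 0 ("wait until no NPU
    # jobs are outstanding"); if the conflicting stripe is two commands back,
    # outstanding.npu == 1, i.e. one NPU job may still be in flight.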

    return watermark, outstanding
Tim Hall79d07d22020-04-27 18:20:16 +0100247
248
Tim Hall79d07d22020-04-27 18:20:16 +0100249def has_prev_op_dependency(prev_cmd, cmd):
250 if prev_cmd is None:
251 return False
252 if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
Louis Verhaard0b8268a2020-08-05 16:11:29 +0200253 if prev_cmd.ofm_tensor.equivalent(cmd.ifm_tensor):
Tim Hall79d07d22020-04-27 18:20:16 +0100254 return True
Tim Hall90337952020-05-07 16:42:35 +0100255 elif cmd.ifm2_tensor is not None:
Louis Verhaard0b8268a2020-08-05 16:11:29 +0200256 return prev_cmd.ofm_tensor.equivalent(cmd.ifm2_tensor)
Tim Hall79d07d22020-04-27 18:20:16 +0100257 return False
258
259
260def get_op_ofm_rect(cmd):
Charles Xu3e9c4342020-04-22 08:31:43 +0200261 start = full_shape(4, cmd.ofm_box.start_coord, 0)
262 end = full_shape(4, cmd.ofm_box.end_coord, 1)
Tim Hall79d07d22020-04-27 18:20:16 +0100263 return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)
264
265
266def get_op_ifm_rect(cmd):
Charles Xu3e9c4342020-04-22 08:31:43 +0200267 start = full_shape(4, cmd.ifm_box.start_coord, 0)
268 end = full_shape(4, cmd.ifm_box.end_coord, 1)
Tim Hall79d07d22020-04-27 18:20:16 +0100269 return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)
270
271
272def get_op_ifmofm_block_depth(arch, cmd):
273 # Note: NOT equivalent to the normal ifm block depth calculation since
274 # it takes into account 'depthless' block operations by returning full
275 # depth
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200276 if cmd.ps.npu_block_type in (
277 NpuBlockType.ConvolutionDepthWise,
278 NpuBlockType.Pooling,
279 NpuBlockType.ElementWise,
280 NpuBlockType.ReduceSum,
281 ):
Tim Hall79d07d22020-04-27 18:20:16 +0100282 return cmd.ofm_box.get_size_shape()[-1]
283
284 return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)
285
286
287def get_op_padding_lt(cmd):
288 if cmd.ps.npu_block_type not in (
289 NpuBlockType.ConvolutionDepthWise,
290 NpuBlockType.Pooling,
291 NpuBlockType.ConvolutionMxN,
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200292 NpuBlockType.ReduceSum,
Tim Hall79d07d22020-04-27 18:20:16 +0100293 ):
294 return (0, 0)
295
296 explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"]) # (top, left, bottom, right)
297
298 # Check if this is for horizontal ifm streaming
299 if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
300 explicit_padding[0] = cmd.pad_top
301 explicit_padding[2] = cmd.pad_bottom
302
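    # The return value is (left, top); e.g. explicit_padding == [1, 2, 0, 0]
    # (top, left, bottom, right) yields (2, 1).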
    return (explicit_padding[1], explicit_padding[0])


def ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
    if ifm_shape == []:
        # Scalar needs to be in IFM2
        return False
    elif ifm2_shape == []:
        return True

    for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
        if ifm != ifm2 and ifm == 1:
            # Broadcasted FM needs to be in IFM2
            return False

    return True


def generate_register_command_stream(nng, sg, arch, verbose=False):
    emit = CommandStreamEmitter()

    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
        }
    else:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
        }

    # Maps an AccumulatorType enum to the corresponding acc_format value
    acc_format_map = {
        SHRAMElements.Acc16: acc_format.FP_S5_10.value,
        SHRAMElements.Acc32: acc_format.INT_32BIT.value,
        SHRAMElements.Acc40: acc_format.INT_40BIT.value,
    }

    # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
    elementwise_mode_map = {
        Op.Mul: elementwise_mode.MUL.value,
        Op.Add: elementwise_mode.ADD.value,
        Op.Sub: elementwise_mode.SUB.value,
        Op.Minimum: elementwise_mode.MIN.value,
        Op.Maximum: elementwise_mode.MAX.value,
        Op.LeakyRelu: elementwise_mode.LRELU.value,
        Op.Abs: elementwise_mode.ABS.value,
        Op.CLZ: elementwise_mode.CLZ.value,
        Op.SHR: elementwise_mode.SHR.value,
        Op.SHL: elementwise_mode.SHL.value,
    }

    cmd_stream = []
    memory_accesses = {}
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            cmd_stream.append(cmd)
            memory_accesses[cmd] = cmd.get_memory_accesses()

    def emit_cmd_waits(cmd_waits):
        if cmd_waits.npu >= 0:
            emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

        if cmd_waits.dma >= 0:
            emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)

    # Initialise operator dependency state
    prev_ifm_rect = cur_ifm_rect = None
    prev_ifm_block_depth = cur_ifm_block_depth = None
    prev_ofm_rect = cur_ofm_rect = None
    prev_ofm_block = cur_ofm_block = None
    prev_kernel = cur_kernel = None
    prev_cmd = None

    if arch.is_yoda_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    dep_watermark = Watermark(0, 0)

    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero, can only set during file writing

    for cmd_index, cmd in enumerate(cmd_stream):
        dep_watermark, cmd_waits = get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, dep_watermark)

        if cmd.cmdtype == CommandType.DMA:
            start_coord = cmd.box.start_coord

            src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
            dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)

            if cmd.in_tensor.compressed_values is not None:
                if cmd.out_tensor.purpose == TensorPurpose.FSBias:
                    sz = cmd.in_tensor.storage_size()
                else:
                    stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
                    sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
            else:
                sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr

            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
            if cmd.out_tensor.purpose == TensorPurpose.LUT:
                emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, BasePointerIndex.Mem2Mem)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])

            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
            dma_channel = 0
            mode = 0  # From external to external

            emit_cmd_waits(cmd_waits)
            emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)

        elif cmd.cmdtype == CommandType.NpuStripe:

            ps = cmd.ps
            primary_op = ps.primary_op
            npu_block_type = ps.npu_block_type
            # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
            use_global_scale = False
            # Specifies type of rounding to be used.
            rounding_mode = (
                rounding.NATURAL if primary_op.attrs.get("rounding_mode", "") == b"NATURAL" else rounding.TFL
            )
            if primary_op.type == Op.ResizeBilinear:
                rounding_mode = rounding.TRUNCATE
            fmf = primary_op.memory_function
            faf = primary_op.activation
            fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
            # Force output scale, used in operations with fused LUT
            # Note: with current LUT support, forced_ofm_quantization is always equal to cmd.ofm_tensor.quantization
            # except when primary_op is AddAct + 0 (no-op) + LUT
            forced_ofm_quantization = primary_op.forced_output_quantization
            ofm_quant = cmd.ofm_tensor.quantization
            if forced_ofm_quantization is not None:
                ofm_quant = forced_ofm_quantization

            # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
            op_to_scale = 0

            # Update state history
            prev_ifm_rect = cur_ifm_rect
            prev_ifm_block_depth = cur_ifm_block_depth
            prev_ofm_rect = cur_ofm_rect
            prev_ofm_block = cur_ofm_block
            prev_kernel = cur_kernel
            cur_kernel = ps.primary_op.kernel if ps.primary_op else None

            block_config = ps.block_config
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)

            shared_buffer = ps.shared_buffer

            if npu_block_type == NpuBlockType.ElementWise:
                ifm2_broadcast = 0

                if cmd.ifm2_tensor and not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape):
                    # The scalar has to be the ifm2 tensor so switch the ifms
                    cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
                    cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box

                    # Set ReverseOperandOrder bit to IFM2_BROADCAST
                    ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder

                # Calculate scales needed for arithmetic elementwise operators
                if primary_op.type in set((Op.Add, Op.Mul, Op.Sub,)):
                    input_scale = cmd.ifm_tensor.quantization.scale_f32 if cmd.ifm_tensor.quantization else None
                    input2_scale = cmd.ifm2_tensor.quantization.scale_f32 if cmd.ifm2_tensor.quantization else None
                    output_scale = ofm_quant.scale_f32 if ofm_quant else None
                    use_global_scale = True

                    if output_scale is not None and faf in (Op.Sigmoid, Op.Tanh):
                        output_scale = 1 / 0x3000
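                        # 0x3000 == 3 * 4096; fused Sigmoid/Tanh uses a fixed
                        # output scale of 1/(3*4096) (see the pooling path below)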

                    if primary_op.type == Op.Mul:
                        if None in (input_scale, input2_scale, output_scale):
                            ofm_scale = 1
                            shift = 0
                        else:
                            ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                    else:  # AddAct/SubAct
                        # Force output scale same as the input scale for
                        # resizebilinear 1x1 that is converted to add
                        if "resizebilinear" in primary_op.attrs:
                            output_scale = input2_scale

                        if None in (input_scale, input2_scale, output_scale):
                            opa_scale = opb_scale = ofm_scale = 1
                            opa_shift = shift = 0
                            ofm_scale, shift = primary_op.attrs.get("rescale", [1, 0])
                        elif input_scale == input2_scale:
                            opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale
                            )
                            opa_shift = 0  # Unused for this case
                        else:
                            # Use advanced implementation only when input scales differ
                            bitdepth = cmd.ifm_tensor.dtype.bits
                            (
                                opa_scale,
                                opa_shift,
                                ofm_scale,
                                shift,
                                op_to_scale,
                            ) = scaling.advanced_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale, bitdepth
                            )
                            opb_scale = 0  # Unused for this case
                            if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
                                # If the operand order is reversed we also have to swap which operand is scaled
                                if op_to_scale == scaling.OperandToScale.OPa:
                                    op_to_scale = scaling.OperandToScale.OPb
                                else:
                                    op_to_scale = scaling.OperandToScale.OPa

                        emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                elif primary_op.type in set((Op.LeakyRelu, Op.Abs,)):
                    output_scale = ofm_quant.scale_f32
                    use_global_scale = True

                    if primary_op.type == Op.LeakyRelu:
                        output_scale = primary_op.attrs["alpha"]

                    ofm_scale, shift = scaling.quantise_scale(output_scale)
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                else:
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)

                # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
                uses_lut = primary_op.activation_lut is not None
                shram_required = arch.available_shram_banks(uses_lut)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)

                # Acc buffers not needed so set AB_START to size of SHRAM
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)

                # Is not a unary operator
                if cmd.ifm2_tensor is not None:
                    if cmd.ifm2_tensor.shape == []:
                        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
                        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
                    else:
                        ifm_box_shape = cmd.ifm_box.get_size_shape()
                        ifm2_box_shape = cmd.ifm2_box.get_size_shape()

                        if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
                            # Broadcast in 'H' dimension
                            assert cmd.ifm2_tensor.shape[1] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

                        if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
                            # Broadcast in 'W' dimension
                            assert cmd.ifm2_tensor.shape[2] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

                        if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
                            # Broadcast in 'C' dimension
                            assert cmd.ifm2_tensor.shape[3] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

                    # Set IFM2_IB_START to the latter half of the IB space
                    ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_IFM2_IB_START,
                        (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
                    )

                emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

            else:
                emit.cmd0_with_param(
                    cmd0.NPU_SET_IFM_IB_END,
                    shared_buffer.bank_locations[SharedBufferArea.IFM]
                    + shared_buffer.banks_required[SharedBufferArea.IFM],
                )
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])

            emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])

            if primary_op.type == Op.ResizeBilinear:
                # perform nearest neighbor upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST)
            elif primary_op.type == Op.Conv2DBackpropInputSwitchedBias:
                # perform insert zero upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE)

            if npu_block_type in set(
                (
                    NpuBlockType.ConvolutionMxN,
                    NpuBlockType.ConvolutionDepthWise,
                    NpuBlockType.Pooling,
                    NpuBlockType.ReduceSum,
                )
            ):
                # Set up padding
                explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

                # Check if this is for horizontal ifm streaming
                if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
                    explicit_padding[0] = cmd.pad_top
                    explicit_padding[2] = cmd.pad_bottom

                # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
                # because an activation function needed to be fused.
                if cmd.ifm_box.start_coord[-2] > 0:
                    explicit_padding[1] = 0
                if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
                    explicit_padding[3] = 0
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])

                # set kernel x stride low bit
                stride = primary_op.attrs["strides"][2] - 1 & 1
                # set kernel y stride low bit
                stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
                # set kernel x stride extension bits
                stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
                # set kernel y stride extension bits
                stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9
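                # KERNEL_STRIDE so far: bit 0 = (stride_x - 1) & 1,
                # bit 1 = (stride_y - 1) & 1, bits 6.. = remaining stride_x bits,
                # bits 9.. = remaining stride_y bits. Bit 2 (part-kernel-first)
                # and bits 3/4 (dilation - 1) are OR-ed in on the convolution
                # path below before the register is written.
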
641
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200642 if npu_block_type in set((NpuBlockType.Pooling, NpuBlockType.ReduceSum)):
Tim Hall79d07d22020-04-27 18:20:16 +0100643 k_height, k_width = primary_op.attrs["ksize"][1:3]
644 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
645 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)
646
647 valid_padding = sum(explicit_padding) == 0
648
Louis Verhaardaee5d752020-09-30 09:01:52 +0200649 if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.ReduceSum)) and valid_padding:
Tim Hall79d07d22020-04-27 18:20:16 +0100650 # For valid padding vela has to output scaling values
Louis Verhaardaee5d752020-09-30 09:01:52 +0200651 if faf == Op.Sigmoid or faf == Op.Tanh:
Tim Hall79d07d22020-04-27 18:20:16 +0100652 rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32
Fredrik Svedberg620d88c2020-05-19 10:43:01 +0200653 if cmd.ifm_tensor.dtype == DataType.int16:
Charles Xuf8992312020-08-18 08:41:54 +0200654 # Calculate scale and shift for the output scale of 1/(3*4096)
655 shift = 0
656 max_rescale = np.iinfo(np.int16).max / 2
657 while rescale <= max_rescale and shift <= 30:
658 shift += 1
659 rescale *= 2
660 scale = int(rescale)
Fredrik Svedberg620d88c2020-05-19 10:43:01 +0200661 else:
Charles Xuf8992312020-08-18 08:41:54 +0200662 rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
663 scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
Fredrik Svedberg620d88c2020-05-19 10:43:01 +0200664 scale = int(round_away_zero(scale * rescale))
Jacob Bohlin9fbc4912020-06-29 11:58:50 +0200665 elif fused_quantize:
666 # Quantize op requires different scaling
667 ifm_scale_f64 = np.double(cmd.ifm_tensor.quantization.scale_f32)
Louis Verhaardd7911c42020-08-25 13:36:41 +0200668 ofm_scale_f64 = np.double(ofm_quant.scale_f32)
Jacob Bohlin9fbc4912020-06-29 11:58:50 +0200669 scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
Louis Verhaardaee5d752020-09-30 09:01:52 +0200670 elif primary_op.type == Op.ResizeBilinear and "rescale" in primary_op.attrs:
Charles Xu87c13502020-08-06 12:17:26 +0200671 rescale = primary_op.attrs["rescale"]
672 rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
673 scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
674 scale = int(round_away_zero(scale * rescale))
Tim Hall79d07d22020-04-27 18:20:16 +0100675 else:
676 # In case avg pool fused with concat or other memory operation, rescaling might be needed.
677 # k_height == k_width == 1 is allways true in this case
678 # Normally the scale is maximised, to get maximum precision, which means that
679 # if rescale != 1, scale need to consider the number of bits needed for rescaling
                            if None not in (ofm_quant.scale_f32, cmd.ifm_tensor.quantization.scale_f32,):
                                rescale = cmd.ifm_tensor.quantization.scale_f32 / ofm_quant.scale_f32
                                rescale_bits = 0
                                if k_height == k_width == 1:
                                    if fmf == Op.ConcatSliceWrite:
                                        rounding_mode = rounding.NATURAL
                                    if rescale > 1:
                                        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                                    elif rescale < 1:
                                        rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
                                scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                                scale = int(round_away_zero(scale * rescale))
                            else:
                                scale = 1
                                shift = 0

                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
                        # Valid-padded average pool should use the global scale from
                        # NPU_SET_OFM_SCALE register, which is set above.
                        use_global_scale = True

                else:  # Convolution
                    assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
                    # Reduced precision quantization and natural rounding used for int16
                    if cmd.ifm_tensor.dtype == DataType.int16:
                        rounding_mode = rounding.NATURAL
                    stride |= (cur_kernel.dilation.y - 1) << 4
                    stride |= (cur_kernel.dilation.x - 1) << 3
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
                    )
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
                    )
                    if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
                        # Part-kernel-first weight ordering
                        assert npu_block_type == NpuBlockType.ConvolutionMxN
                        stride |= 1 << 2

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

            elif npu_block_type in set((NpuBlockType.VectorProduct,)):
                # Vector product is implemented using a 1x1 convolution, so we need
                # to set up the appropriate padding and kernel info
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)

                # kernel stride reg = 0 means stride(1,1) + depth first weight
                # order + dilation(0,0) + kernel_split_size=8
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
            ):
                # Emit Weight base address commands, only maps the area required for
                # this command's weights from the larger tensor.
                stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
                weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index]
                substreams = len(weight_substream_offsets) - 1  # Offset list must terminate with full stream length

                # Extract weight substream offsets and calculate their lengths
                assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
                weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)

                # Set weights sources for active and present cores
                for core, param in enumerate(
                    [
                        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
                        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
                    ]
                ):
                    if core < substreams:
                        emit.cmd1_with_offset(param[0], weight_addr + weight_substream_offsets[core])
                        emit.cmd1_with_offset(
                            param[1], weight_substream_offsets[core + 1] - weight_substream_offsets[core]
                        )
                    elif core < arch.ncores:
                        emit.cmd1_with_offset(param[0], weight_addr)
                        emit.cmd1_with_offset(param[1], 0)

                weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
                emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)

                # Emit Scale & Bias base address commands, with length matching the amount required by
                # the weight tensors.
                if cmd.scale_tensor is not None:
                    scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index]
                    substreams = len(scale_substream_offsets) - 1  # Offset list must terminate with full stream length

                    # Extract scale substream offsets and calculate their lengths
                    assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
                    scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])

                    # Set scale sources for active and present cores
                    for core, param in enumerate(
                        [
                            (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
                            (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
                        ]
                    ):
                        if core < substreams:
                            emit.cmd1_with_offset(param[0], scale_addr + scale_substream_offsets[core])
                            emit.cmd1_with_offset(
                                param[1], scale_substream_offsets[core + 1] - scale_substream_offsets[core]
                            )
                        elif core < arch.ncores:
                            emit.cmd1_with_offset(param[0], scale_addr)
                            emit.cmd1_with_offset(param[1], 0)

                    # Emit base address for NPU to access scale & bias data
                    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
                    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)

            ofm_quant_qmin = ofm_quant.quant_min if ofm_quant else np.iinfo(np.int16).min
            ofm_quant_qmax = ofm_quant.quant_max if ofm_quant else np.iinfo(np.int16).max
            ifm_min = cmd.ifm_tensor.quantization.min if cmd.ifm_tensor.quantization else np.iinfo(np.int16).min
            ifm_max = cmd.ifm_tensor.quantization.max if cmd.ifm_tensor.quantization else np.iinfo(np.int16).max

            # Emit commands for any fused activation function
            if faf is None:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                # Even if no activation function, values need to be set to override previous values
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            elif faf == Op.Relu:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = ofm_quant_qmax
            elif faf == Op.Relu6:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == Op.ReluN1To1:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == Op.Tanh:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
                if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)):
                    faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == Op.Sigmoid:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
                if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)):
                    faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == Op.LUT:
                lut_index = int(activation.LUT_START.value) + primary_op.attrs.get("lut_index", -1)
                assert activation.LUT_START.value <= lut_index <= activation.LUT_END.value, "LUT index out of range."
                if cmd.ofm_tensor.dtype == DataType.int32:
                    lut_index |= 3 << 12  # Force I8 range
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, lut_index)
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            else:
                raise Exception("Unsupported fused_activation_function = " + faf.name)

            # Activation range needs to be set based upon the quantisation range and the fused activation range
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))

            out_shape = cmd.ofm_box.get_size_shape()
            if len(out_shape) >= 4:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
            if len(out_shape) >= 2:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)

            if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum)):
                in_shape = cmd.ifm_box.get_size_shape()
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)

            for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in (
                (
                    cmd.ifm_tensor,
                    cmd.ifm_box,
                    cmd0.NPU_SET_IFM_REGION,
                    (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
                    (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
                    cmd0.NPU_SET_IFM_ZERO_POINT,
                ),
                (
                    cmd.ifm2_tensor,
                    cmd.ifm2_box,
                    cmd0.NPU_SET_IFM2_REGION,
                    (
                        cmd1.NPU_SET_IFM2_BASE0,
                        cmd1.NPU_SET_IFM2_BASE1,
                        cmd1.NPU_SET_IFM2_BASE2,
                        cmd1.NPU_SET_IFM2_BASE3,
                    ),
                    (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
                    cmd0.NPU_SET_IFM2_ZERO_POINT,
                ),
                (
                    cmd.ofm_tensor,
                    cmd.ofm_box,
                    cmd0.NPU_SET_OFM_REGION,
                    (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
                    (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
                    cmd0.NPU_SET_OFM_ZERO_POINT,
                ),
            ):

                if tens is None:
                    continue

                need_zero_point = (
                    (faf is not None and forced_ofm_quantization is None)
                    or (fmf == Op.ConcatSliceWrite)
                    or fused_quantize
                )
                if (
                    (primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL)) and not need_zero_point)
                    or (
                        tens.dtype == DataType.int32
                        and zero_point_op in (cmd0.NPU_SET_IFM_ZERO_POINT, cmd0.NPU_SET_IFM2_ZERO_POINT)
                    )
                    or tens.quantization is None
                ):
                    # Actual integer operation, just set scale to 1 and zero point to 0
                    emit.cmd0_with_param(zero_point_op, 0)
                else:
                    assert tens.quantization.zero_point is not None, "need an actual zero point set"
                    if cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op and forced_ofm_quantization is not None:
                        zero_point = forced_ofm_quantization.zero_point
                    elif (
                        "resizebilinear" in primary_op.attrs
                        and primary_op.type == Op.Add
                        and cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op
                    ):
                        # Force output zero point same as the input zero point
                        # for resizebilinear 1x1 that is converted to add
                        zero_point = cmd.ifm2_tensor.quantization.zero_point
                    else:
                        zero_point = tens.quantization.zero_point
                    emit.cmd0_with_param(zero_point_op, int(zero_point))

                if tens.shape == []:
                    # Empty shape, elementwise constant
                    ifm2_scalar = tens.quant_values
                    assert ifm2_scalar.size == 1
                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0)))
                    continue

                height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
                    box.start_coord, box.end_coord
                )
                if npu_block_type != NpuBlockType.VectorProduct:
                    if tens == cmd.ifm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
                    elif tens == cmd.ofm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
                    if tens == cmd.ifm2_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
                else:
                    if len(out_shape) == 2:
                        assert out_shape[0] == 1
                        if tens == cmd.ifm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, 0)
                        elif tens == cmd.ofm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, 0)
                    else:
                        assert False

                emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])

                for idx, addr in enumerate(addresses):
                    if addr is None:
                        addresses[idx] = 0

                emit.cmd1_with_offset(ptr_ops[0], addresses[0])
                emit.cmd1_with_offset(ptr_ops[1], addresses[1])
                emit.cmd1_with_offset(ptr_ops[2], addresses[2])
                emit.cmd1_with_offset(ptr_ops[3], addresses[3])

                strides = tens.get_strides()
                emit.cmd1_with_offset(stride_ops[0], strides[1])  # stride between 16-byte channel blocks (C)
                emit.cmd1_with_offset(stride_ops[2], strides[3])  # stride between horizontal values (W)
                emit.cmd1_with_offset(stride_ops[1], strides[2])  # stride between vertical values (H)

                if tens.format == TensorFormat.NHCWB16:
                    # Check that all BasePointer addresses are aligned to 16 bytes
                    assert (int(addresses[0]) % 16) == 0
                    assert (int(addresses[1]) % 16) == 0
                    assert (int(addresses[2]) % 16) == 0
                    assert (int(addresses[3]) % 16) == 0

            ofm_dtype = cmd.ofm_tensor.dtype
            assert ofm_dtype.type & BaseType.Int
            prec = 0
            if ofm_dtype.size_in_bits() == 8:
                prec = 0
            elif ofm_dtype.size_in_bits() == 16:
                prec = 2
            elif ofm_dtype.size_in_bits() == 32:
                prec = 4
            else:
                assert 0

            if ofm_dtype.type & BaseType.Signed:
                prec += 1

            if use_global_scale:
                # Set global scale bit, as opposed to using per channel scale
                prec |= 1 << 8

            if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
                prec |= 1 << 6

            prec |= rounding_mode.value << 14
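            # OFM_PRECISION layout as assembled above: bits [2:0] = data type
            # (width code plus signed bit), bit 6 = NHCWB16 format, bit 8 =
            # global scale, bits [15:14] = rounding mode.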

            emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

            prec = None
            weight_bits = 8
            if cmd.weight_tensor is not None:
                weight_bits = cmd.weight_tensor.dtype.size_in_bits()

            ifm_dtype = cmd.ifm_tensor.dtype

            assert weight_bits == 8, "Unsupported weight bit depth"
            assert (
                ifm_dtype.size_in_bits() in {8, 16}
                or ifm_dtype.size_in_bits() == 32
                and npu_block_type in (NpuBlockType.ElementWise, NpuBlockType.ReduceSum)
            ), "Unsupported ifm bit depth"

            if ifm_dtype.size_in_bits() == 8:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S8
                else:
                    prec = ifm_precision.U8
            elif ifm_dtype.size_in_bits() == 16:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S16
                else:
                    prec = ifm_precision.U16
            elif ifm_dtype == DataType.int32:
                prec = ifm_precision.S32

            ifm_prec = prec.value
            ifm2_prec = ifm_prec

            if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
                ifm_prec |= 1 << 6

            ifm_prec |= op_to_scale << 8

            emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)

            if cmd.ifm2_tensor is not None:
                if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
                    ifm2_prec |= 1 << 6
                emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)

            # Get op parameters
            cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
            cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
            cur_ofm_rect = get_op_ofm_rect(cmd)
            cur_ifm_rect = get_op_ifm_rect(cmd)
            cur_padLT = get_op_padding_lt(cmd)
            if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
                if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
                    blockdep = arch.calc_block_dep(
                        prev_ifm_rect,
                        prev_ofm_rect,
                        prev_ifm_block_depth,
                        prev_ofm_block,
                        prev_kernel,
                        cur_ifm_rect,
                        cur_ofm_rect,
                        cur_ifm_block_depth,
                        cur_ofm_block,
                        cur_kernel,
                        cur_padLT,
                    )
                else:
                    blockdep = 0
            else:
                blockdep = ArchitectureFeatures.MAX_BLOCKDEP

            # Set between every op (dependent or not)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_cmd = cmd

            emit_cmd_waits(cmd_waits)
            DebugDatabase.add_command(stream_id, emit.offset, primary_op)

            if npu_block_type == NpuBlockType.ConvolutionMxN:
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
                emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
            elif npu_block_type == NpuBlockType.VectorProduct:
                # Vector product is implemented using a 1x1 convolution
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.Pooling:
                param = pooling_mode.MAX.value if primary_op.type.is_maxpool_op() else pooling_mode.AVERAGE.value
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
            elif npu_block_type == NpuBlockType.ReduceSum:
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_mode.REDUCE_SUM.value)
            elif npu_block_type == NpuBlockType.ElementWise:
                param = elementwise_mode_map[primary_op.type]
                emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
            else:
                print("Warning: Skipping register command stream generation for", ps)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)

    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))
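

# Typical usage (a sketch; `sg` is a scheduled and allocated subgraph produced
# by the preceding compilation passes, `arch` an ArchitectureFeatures instance):
#
#     generate_register_command_stream(nng, sg, arch, verbose=True)
#     words = sg.register_command_stream  # flat list of 32-bit command words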