Blame - ethosu/vela/npu_performance.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

1

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

17

# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the

18

# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.

19

#

20

# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance

21

# estimate.

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

22

from enum import auto

23

from enum import IntEnum

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

24

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

25

import numpy as np

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

26

27

from . import numeric_util

Diqing Zhong

2020-09-28 18:46:22 +0200

[diff] [blame]

28

from .architecture_features import Accelerator

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

29

from .architecture_features import Block

Diqing Zhong

2020-09-24 09:53:48 +0200

[diff] [blame]

30

from .data_type import DataType

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

31

from .nn_graph import PassPlacement

32

from .nn_graph import SchedulerRewrite

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

33

from .operation import NpuBlockType

Diqing Zhong

2020-09-24 09:53:48 +0200

[diff] [blame]

34

from .operation import Op

Diqing Zhong

2020-09-28 18:46:22 +0200

[diff] [blame]

35

from .shared_buffer_allocation import is_acc_40bits_used

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

36

from .tensor import MemArea

37

from .tensor import shape_num_elements

38

from .tensor import TensorBlockTraversal

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

39

from .tensor import TensorFormat

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

40

from .tensor import TensorPurpose

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

41

42

43

def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2):
    """Return the worst-case [height, width] of the rolling buffer between two passes.

    The size is based on the IFM block that ps2 needs in order to produce one
    OFM block of block_config_ps2.  The height is padded by one ps1 block
    height (block_config_ps1[0]) and rounded up to a multiple of it.  ps1
    itself is not inspected; only its block config is used.
    """
    # Block takes (width, height, depth); the block config is indexed from the end
    ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1])
    kernel = ps2.primary_op.kernel

    if ps2.npu_block_type in {NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct}:
        consumer_ifm = ps2.primary_op.ifm
        ifm_block_depth = arch.calc_ifm_block_depth(consumer_ifm.shape[-1], consumer_ifm.dtype.size_in_bits())
    else:
        ifm_block_depth = block_config_ps2[-1]

    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)

    # The performed height calculation is for worst case
    ps1_block_height = block_config_ps1[0]
    worst_case_height = numeric_util.round_up(ifm_block.height + ps1_block_height, ps1_block_height)
    return [worst_case_height, ifm_block.width]

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

59

60

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

61

class PassCycles(IntEnum):
    """Indices into a per-pass cycle-count array.

    Size is not a real category; it is the number of entries that precede it
    and is used to dimension the array.
    """

    Npu = 0
    Cpu = auto()
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        """Return the human-readable label of this entry."""
        labels = (
            "NPU",
            "CPU",
            "SRAM Access",
            "DRAM Access",
            "On-chip Flash Access",
            "Off-chip Flash Access",
            "Total",
            "Size",
        )
        return labels[self.value]

    def identifier_name(self):
        """Return the lower-case, underscore-separated identifier of this entry."""
        identifiers = (
            "npu",
            "cpu",
            "sram_access",
            "dram_access",
            "on_chip_flash_access",
            "off_chip_flash_access",
            "total",
            "size",
        )
        return identifiers[self.value]

    @staticmethod
    def all():
        """Return every entry except the Size sentinel, in declaration order."""
        return tuple(entry for entry in PassCycles if entry is not PassCycles.Size)

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

108

class MacCount(IntEnum):
    """Indices into a MAC-count array.

    Size is the number of real entries and is used to dimension the array.
    """

    NeuralNetworkMacs = 0
    HardwareMacs = auto()
    Size = auto()

    def display_name(self):
        """Return the human-readable label of this entry."""
        labels = ("Neural Network Macs", "Hardware Macs", "Size")
        return labels[self.value]

    def identifier_name(self):
        """Return the lower-case identifier of this entry."""
        identifiers = ("nn_macs", "hardware_macs", "size")
        return identifiers[self.value]

    @staticmethod
    def all():
        """Return every entry except the Size sentinel, in declaration order."""
        return tuple(entry for entry in MacCount if entry is not MacCount.Size)

122

123

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

124

class BandwidthDirection(IntEnum):
    """Indices for the direction axis of a bandwidth array.

    Size is the number of real directions and is used to dimension the array.
    """

    Read = 0
    Write = auto()
    Size = auto()

    def display_name(self):
        """Return the member name as the display label."""
        label = self.name
        return label

    def identifier_name(self):
        """Return the member name in lower case."""
        label = self.name
        return label.lower()

    @staticmethod
    def all():
        """Return every direction except the Size sentinel, in declaration order."""
        return tuple(entry for entry in BandwidthDirection if entry is not BandwidthDirection.Size)

138

139

140

def make_bandwidth_array():
    """Return a zero-filled array indexed by [MemArea][TensorPurpose][BandwidthDirection]."""
    shape = (MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size)
    return np.zeros(shape)

142

143

144

def make_macs_array():
    """Return a zero-filled integer array with one entry per MacCount member before Size."""
    # np.int was only an alias of the builtin int; it was deprecated in NumPy 1.20
    # and removed in 1.24, so the builtin is used directly (same resulting dtype).
    return np.zeros(MacCount.Size, int)

146

147

148

def make_cycles_array():
    """Return a zero-filled array with one entry per PassCycles member before Size."""
    n_entries = PassCycles.Size
    return np.zeros(n_entries)

150

151

152

def make_metrics_arrays():
    """Return fresh zero-filled (bandwidths, macs, cycles) arrays."""
    bandwidths = make_bandwidth_array()
    mac_counts = make_macs_array()
    cycle_counts = make_cycles_array()
    return (bandwidths, mac_counts, cycle_counts)

154

155

156

def get_n_blocks_and_area(
    ifm_brick_size, ifm_height_width, orig_skirt, clamped_skirt, block_config, min_block_size, strides
):
    """Count the blocks needed to cover the IFM and the total area read for them.

    Returns a tuple (total_blocks, total_area, block_setup) where block_setup
    is a 4-tuple of (number_of_blocks, block_size) pairs, one for each of the
    four block categories described in the comment below.
    """
    # IFM extent covered by one full block along height and width
    ifm_block_config = (block_config[0] * strides[1], block_config[1] * strides[2])

    n_normal_blocks = []
    remainder_size = []
    for axis in (0, 1):
        non_skirt_dim = ifm_height_width[axis] - orig_skirt[axis] - orig_skirt[2 + axis]
        full_blocks = non_skirt_dim // ifm_block_config[axis]
        leftover = non_skirt_dim - full_blocks * ifm_block_config[axis]
        n_normal_blocks.append(full_blocks)
        remainder_size.append(numeric_util.round_up(((leftover - 1) // strides[axis + 1]) + 1, min_block_size[axis]))

    # this will actually calculate reads into the edge padding.

    # there are four cases in total, handling the edges that will not fill a complete block.

    # 0000000001
    # 0000000001
    # 0000000001
    # 0000000001
    # 0000000001
    # 0000000001
    # 2222222223
    block_setup = (
        (n_normal_blocks[0] * n_normal_blocks[1], block_config),
        (1 * n_normal_blocks[1], (remainder_size[0], block_config[1])),
        (n_normal_blocks[0] * 1, (block_config[0], remainder_size[1])),
        (1 * 1, remainder_size),
    )

    total_blocks = 0
    total_area = 0
    for block_count, block_size in block_setup:
        # a category with an empty dimension contributes nothing
        if 0 in (block_size[0], block_size[1]):
            continue
        read_dims = [
            numeric_util.round_up(clamped_skirt[axis], ifm_brick_size[axis + 1])
            + block_size[axis] * strides[axis + 1]
            + numeric_util.round_up(clamped_skirt[2 + axis], ifm_brick_size[axis + 1])
            for axis in (0, 1)
        ]
        assert block_count >= 0
        total_blocks += block_count
        total_area += block_count * read_dims[0] * read_dims[1]

    assert total_blocks >= 1
    return total_blocks, total_area, block_setup

209

210

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

211

def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth):
    """Return the IFM block depth, never larger than ifm_depth.

    For ConvolutionMxN and ReduceSum the depth is 16 for 16-bit IFMs or
    part-kernel-first traversal, 32 for 8-bit IFMs and 8 otherwise.  Every
    other block type uses the OFM block depth.
    """
    if npu_block_type not in (NpuBlockType.ConvolutionMxN, NpuBlockType.ReduceSum):
        return min(ifm_depth, ofm_blk_depth)

    if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
        depth_limit = 16
    elif ifm_elemwidth == 8:
        depth_limit = 32
    else:
        depth_limit = 8
    return min(ifm_depth, depth_limit)

223

224

225

def estimate_output_cycles(
    arch, npu_block_type, primary_op, num_elems, ifm_tensor, ofm_tensor, ifm2_tensor, use_acc_40bits=False
):
    """Estimate the cycles needed to output num_elems elements.

    An output index (0-7) is selected from the operation kind and data types
    and an activation index (0-2) from the fused activation.  The larger of
    arch.output_cycles_per_elem[output index] and
    arch.activation_cycles_per_elem[activation index] is multiplied by
    num_elems.
    """
    activation = primary_op.activation
    faf = activation.op_type if activation is not None else None

    acc_block_types = (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
        NpuBlockType.VectorProduct,
    )

    if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32:
        # 0: unary op, 1: binary op
        output_perf_index = 0 if ifm2_tensor is None else 1
    elif primary_op.type == Op.Mul and ofm_tensor.dtype == DataType.int32:
        output_perf_index = 2
    elif primary_op.type == Op.Mul or (npu_block_type in acc_block_types and use_acc_40bits):
        output_perf_index = 3
    elif primary_op.type in (Op.Add, Op.Sub):
        scale_in = ifm_tensor.quantization.scale_f32
        scale_in2 = ifm2_tensor.quantization.scale_f32
        scale_out = ofm_tensor.quantization.scale_f32

        if "resizebilinear" in primary_op.attrs:
            scale_out = scale_in2

        is_simple = None in (scale_in, scale_in2, scale_out) or scale_in == scale_in2
        # 4: simple Add/Sub, 5: advanced Add/Sub
        output_perf_index = 4 if is_simple else 5
    elif primary_op.type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    output_cost = arch.output_cycles_per_elem[output_perf_index]
    activation_cost = arch.activation_cycles_per_elem[activation_perf_index]
    return num_elems * max(output_cost, activation_cost)

280

281

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

282

def estimate_conv_pooling_cycles(
    arch, npu_block_type, primary_op, block_config: Block, block_traversal, kernel_dims, ifm_tensor, ofm_tensor
):
    """Estimate the NPU cycles of a pass processed block by block.

    The cycles of one OFM block are computed for the compute part (summed
    over all sub-kernels and divided by arch.ncores) and for the output
    part.  The slower of the two is charged once per OFM block and the faster
    one is added a single time.

    Note: block_config.height is set to 1 in place when the single-row
    micro-block adjustment below applies.
    """
    hw_ublock = arch.config.ofm_ublock
    ofm_ublock = Block(hw_ublock.width, hw_ublock.height, hw_ublock.depth)
    ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)
    ofm_tens_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1)

    conv_like_types = (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
    if (
        hw_ublock.height == 2
        and npu_block_type in conv_like_types
        and ofm_tens_shape[1] == 1
        # Optimisation only applies for even width tensors
        and ofm_tens_shape[2] % 2 == 0
        and kernel_dims[0] == 1
    ):
        ofm_ublock.width = 4
        ofm_ublock.height = 1
        block_config.height = 1

    num_ublk = (
        numeric_util.round_up_divide(block_config.width, ofm_ublock.width)
        * (block_config.height // ofm_ublock.height)
        * (block_config.depth // ofm_ublock.depth)
    )
    num_elems_blk = block_config.width * block_config.height * block_config.depth

    use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)

    # split the kernel into sub-kernels no larger than the hardware limits
    limit_y, limit_x = arch.sub_kernel_limits[npu_block_type][0], arch.sub_kernel_limits[npu_block_type][1]
    n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], limit_y)
    n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], limit_x)
    sub_kernel_x = [min(kernel_dims[1] - idx * limit_x, limit_x) for idx in range(n_sub_kernels_x)]
    sub_kernel_y = [min(kernel_dims[0] - idx * limit_y, limit_y) for idx in range(n_sub_kernels_y)]

    ifm_bits = ifm_tensor.dtype.size_in_bits()
    ifm_blk_depth = get_ifm_block_depth(npu_block_type, ifm_tens_shape[3], ifm_bits, block_traversal, block_config.depth)

    depth_first_like = (
        (npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst)
        or npu_block_type == NpuBlockType.VectorProduct
        or npu_block_type == NpuBlockType.ReduceSum
    )

    cycles_dpu_blk = 0
    for height in sub_kernel_y:
        for width in sub_kernel_x:
            num_kernel_elems = width * height
            if npu_block_type == NpuBlockType.Pooling:
                cycles = max(4, num_kernel_elems) * num_ublk
                if ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                    cycles *= 2
            elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
                cycles = 4 * numeric_util.round_up_divide(num_kernel_elems, 4) * num_ublk
                if ifm_bits == 16:
                    cycles *= 2
            elif depth_first_like:
                cycles = (
                    4 * num_kernel_elems * num_ublk * numeric_util.round_up_divide(ifm_tens_shape[3], ifm_blk_depth)
                )
            else:
                assert block_traversal == TensorBlockTraversal.PartKernelFirst
                divider = 2 if ifm_bits == 16 else 4
                cycles = 4 * (
                    numeric_util.round_up_divide(num_kernel_elems, divider)
                    * numeric_util.round_up_divide(ifm_blk_depth, 8)
                    * num_ublk
                    * numeric_util.round_up_divide(ifm_tens_shape[3], ifm_blk_depth)
                )
            cycles_dpu_blk += cycles

    cycles_dpu_blk /= arch.ncores

    num_ofm_blk = (
        numeric_util.round_up_divide(ofm_tens_shape[1], block_config.height)
        * numeric_util.round_up_divide(ofm_tens_shape[2], block_config.width)
        * numeric_util.round_up_divide(ofm_tens_shape[3], block_config.depth)
    )

    cycles_output_blk = estimate_output_cycles(
        arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, None, use_acc_40bits
    )

    # the slower stage is paid for every block, the faster one only once
    if cycles_dpu_blk > cycles_output_blk:
        return cycles_dpu_blk * num_ofm_blk + cycles_output_blk
    return cycles_output_blk * num_ofm_blk + cycles_dpu_blk

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

375

def estimate_memory_bandwidth(arch, mem_area, direction, tensor, block_size: Block, replace_bw=None):
    """Return the tensor bandwidth scaled by an estimate of the burst efficiency.

    The base bandwidth is replace_bw when given, otherwise the tensor's own
    bandwidth().  For NHWC/NHCWB16 tensors it is multiplied by
    max_burst_len / burst_len; tensors in any other format get the base
    bandwidth unchanged.
    """
    if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16):
        return replace_bw if replace_bw is not None else tensor.bandwidth()

    # Estimate memory transfer efficiency by calculating the burst length
    # this is related to data format, block shape, and tensor shape, etc.
    max_burst_len = 32 if mem_area == MemArea.Sram else 128
    elem_size = tensor.dtype.size_in_bytes()
    is_ifm = direction == BandwidthDirection.Read

    # work on a copy so that the format change does not touch the caller's tensor
    tens = tensor.clone()
    if not tens.avoid_NHCWB16:
        tens.set_format(TensorFormat.NHCWB16, arch)

    full_row_burst = elem_size * block_size.depth * block_size.width
    if tens.format == TensorFormat.NHCWB16:
        if tens.get_strides()[1] == block_size.depth:
            burst_len = full_row_burst
        elif is_ifm:
            burst_len = 16 * elem_size * block_size.width
        else:
            burst_len = 16 * elem_size * block_size.width * arch.ncores
    else:
        assert tens.format == TensorFormat.NHWC
        if is_ifm:
            if tens.get_strides()[3] == block_size.depth:
                burst_len = full_row_burst
            else:
                burst_len = elem_size * block_size.depth
        elif block_size.depth <= 16 and tens.get_strides()[3] == block_size.depth:
            burst_len = full_row_burst
        else:
            burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)

    burst_len = min(max_burst_len, burst_len)
    base_bw = tens.bandwidth() if replace_bw is None else replace_bw

    return base_bw * (max_burst_len / burst_len)

413

414

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

415

def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):
    """Estimate bandwidths, MAC counts and cycles for a single pass.

    Parameters:
        arch: architecture description used for all hardware parameters.
        ps: the pass to evaluate.
        block_config: block configuration to evaluate; defaults to ps.block_config.
        rewrite_list: optional iterable of 6-element tuples of which the first,
            second and last elements are (rewrite_op, tensor, ..., pass); only
            entries whose pass equals ps are applied.  Defaults to no rewrites.
        force_outputs_to_fast_storage: when true, output writes are accounted
            to arch.fast_storage_mem_area instead of each tensor's own area.

    Returns:
        A tuple (bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple).
    """
    # A None sentinel replaces the former mutable default argument ([]).
    if rewrite_list is None:
        rewrite_list = ()
    if block_config is None:
        block_config = ps.block_config
    bws = make_bandwidth_array()
    macs = make_macs_array()
    cycles = make_cycles_array()
    blocks = 0
    ifm_read_multiple = 1
    weight_read_multiple = 0

    if ps.placement in set((PassPlacement.MemoryOnly, PassPlacement.StartupInit)):
        return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple  # nothing real happening in this pass

    min_block_size = arch.min_block_sizes[ps.npu_block_type]

    skirt = (0, 0, 0, 0)
    explicit_padding = (0, 0, 0, 0)
    primary_op = ps.primary_op
    # read bandwidths computed below that override a tensor's own bandwidth()
    replacement_read_bws = {}
    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    ifm_block = Block(block_config[1], block_config[0], block_config[3])

    if ps.placement == PassPlacement.Cpu:
        cycles[PassCycles.Cpu] = arch.cpu_cycle_estimate(ps.ops[0])
    elif primary_op:
        skirt = primary_op.attrs.get("skirt", skirt)
        explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
        assert primary_op.type.npu_block_type == ps.npu_block_type
        npu_block_type = primary_op.type.npu_block_type
        block_traversal = TensorBlockTraversal.Default

        ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
        ifm_tensor_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)

        if npu_block_type in set(
            (
                NpuBlockType.ConvolutionMxN,
                NpuBlockType.ConvolutionDepthWise,
                NpuBlockType.Pooling,
                NpuBlockType.ReduceSum,
            )
        ):
            # extend the ifm to full dimension
            ifm_tensor_brick_size = tuple(numeric_util.full_shape(4, list(ifm_tensor.brick_size), 1))
            ifm_tensor_bandwidth_shape = numeric_util.full_shape(4, ifm_tensor.bandwidth_shape, 1)

            batch_size = ifm_tensor_shape[0]
            ifm_depth = ifm_tensor_bandwidth_shape[3]

            # add in padding
            ifm_tensor_shape[1] += explicit_padding[0] + explicit_padding[2]  # height += top and bottom
            ifm_tensor_shape[2] += explicit_padding[1] + explicit_padding[3]  # width += left and right

            strides = primary_op.attrs["strides"]
            if npu_block_type != NpuBlockType.Pooling:
                if npu_block_type == NpuBlockType.ReduceSum:
                    # no weight tensor is read here, so weight bandwidth terms are zero
                    block_traversal = TensorBlockTraversal.DepthFirst
                    weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]]
                    weight_tensor_bandwidth_shape = [0] * 4
                    weight_tensor_element_size = 0
                    weight_tensor_bandwidth_compression_scale = 0.0
                else:
                    block_traversal = weight_tensor.block_traversal
                    weight_tensor_shape = weight_tensor.shape
                    weight_tensor_bandwidth_shape = weight_tensor.bandwidth_shape
                    weight_tensor_element_size = weight_tensor.element_size()
                    weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale
                nn_ops = (
                    int(ofm_tensor.shape[0])
                    * int(ofm_tensor.shape[1])
                    * int(ofm_tensor.shape[2])
                    * int(weight_tensor_shape[0])
                    * int(weight_tensor_shape[1])
                    * int(weight_tensor_shape[2])
                    * int(weight_tensor_shape[3])
                )
            else:
                # pooling: build a pseudo weight shape from the kernel size
                weight_tensor_shape = [
                    primary_op.attrs["ksize"][1],
                    primary_op.attrs["ksize"][2],
                    1,
                    ifm_tensor_shape[3],
                ]
                weight_tensor_bandwidth_shape = weight_tensor_shape
                weight_tensor_element_size = 0
                weight_tensor_bandwidth_compression_scale = 0.0
                nn_ops = 0  # pooling doesn't count as NN ops

            kernel_dims = weight_tensor_shape[:2]

            sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
            # count the sub kernels; the IFM block needs to be refetched for each of them
            n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
            n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
            n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x

            clamped_skirt = list(skirt)
            clamped_skirt[2] = min(clamped_skirt[2], sub_kernel_limits[0] - 1 - clamped_skirt[0])
            clamped_skirt[3] = min(clamped_skirt[3], sub_kernel_limits[1] - 1 - clamped_skirt[1])
            n_blocks, area, block_setup = get_n_blocks_and_area(
                ifm_tensor_brick_size,
                ifm_tensor_shape[1:3],
                skirt,
                clamped_skirt,
                block_config,
                min_block_size,
                strides,
            )

            blocks = n_blocks * numeric_util.round_up_divide(weight_tensor_shape[3], ofm_block.depth)

            n_weight_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth)
            if npu_block_type == NpuBlockType.ConvolutionDepthWise or npu_block_type == NpuBlockType.Pooling:
                n_weight_stages = 1  # force to no reread

            ifm_tensor_bw = (
                n_sub_kernels
                * batch_size
                * area
                * ifm_depth
                * n_weight_stages
                * ifm_tensor.element_size()
                * ifm_tensor.bandwidth_compression_scale
            )
            replacement_read_bws[ifm_tensor] = ifm_tensor_bw
            ifm_read_multiple = n_weight_stages

            replacement_read_bws[weight_tensor] = (
                batch_size
                * shape_num_elements(weight_tensor_bandwidth_shape)
                * weight_tensor_element_size
                * weight_tensor_bandwidth_compression_scale
                * n_blocks
            )  # read once per block and batch
            weight_read_multiple = n_blocks

            n_kernel_xy = kernel_dims[0] * kernel_dims[1]
            n_input_channels_at_a_time = block_config[2]

            if npu_block_type == NpuBlockType.Pooling or block_traversal in set(
                (TensorBlockTraversal.PartKernelFirst, TensorBlockTraversal.DepthWise)
            ):
                n_input_channels_at_a_time = numeric_util.round_up_divide(n_input_channels_at_a_time, 4)
                n_kernel_xy = max(
                    n_kernel_xy, 4
                )  # need at least 4, as this is the minimum duty cycle for secondary accumulator writes
                if weight_tensor is not None:
                    n_kernel_xy = numeric_util.round_up(n_kernel_xy, 4)  # weights need to be read in blocks of 4

            num_mac_ops = 0
            for n_blocks_for_size, block_size in block_setup:
                num_mac_ops += (
                    batch_size
                    * n_blocks_for_size
                    * block_size[0]
                    * block_size[1]
                    * numeric_util.round_up(weight_tensor_shape[2], n_input_channels_at_a_time)
                    * numeric_util.round_up(weight_tensor_shape[3], ofm_block.depth)
                    * n_kernel_xy
                )

            macs[MacCount.NeuralNetworkMacs] += nn_ops
            macs[MacCount.HardwareMacs] += num_mac_ops
            cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
                arch, npu_block_type, primary_op, ofm_block, block_traversal, kernel_dims, ifm_tensor, ofm_tensor,
            )
        elif npu_block_type == NpuBlockType.VectorProduct:
            nn_macs = (
                ifm_tensor.shape[0]
                * numeric_util.round_up(weight_tensor.shape[-2], block_config[2])
                * numeric_util.round_up(weight_tensor.shape[-1], block_config[3])
            )
            num_mac_ops = nn_macs
            block_traversal = weight_tensor.block_traversal

            cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
                arch, npu_block_type, primary_op, ofm_block, block_traversal, [1, 1], ifm_tensor, ofm_tensor,
            )
            macs[MacCount.NeuralNetworkMacs] += nn_macs
            macs[MacCount.HardwareMacs] += num_mac_ops

            blocks = 1 * numeric_util.round_up_divide(weight_tensor.shape[-1], ofm_block.depth)

            # weight reads are scaled by the fraction of IFM positions that are non-zero
            non_zero_fraction = 1.0
            if ifm_tensor.values is not None:
                nz_vector = np.amax(ifm_tensor.values != 0, axis=0)  # max across batch axis
                non_zero_fraction = np.average(nz_vector)

            replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth()
            replacement_read_bws[weight_tensor] = weight_tensor.bandwidth() * non_zero_fraction
            ifm_read_multiple = 1
            weight_read_multiple = non_zero_fraction
        elif npu_block_type == NpuBlockType.ElementWise:
            # Work out how many elements we have and calculate performance.
            cycles[PassCycles.Npu] = estimate_output_cycles(
                arch, npu_block_type, primary_op, ofm_tensor.elements(), ps.ifm_tensor, ps.ofm_tensor, ps.ifm2_tensor
            )

        ifm_block_depth = get_ifm_block_depth(
            npu_block_type, ifm_tensor_shape[3], ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
        )
        ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, primary_op.kernel)

        prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None)
        if prev_npu_pass is None:
            # cycles for DMA ops in first pass
            dma_ops = (op for op in ps.ops if op.type == Op.DMA)
            for dma_op in dma_ops:
                mem_area = dma_op.attrs["source"]
                for tens in dma_op.inputs:
                    cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area]

    # apply the desired rewrites
    for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
        if ps != ps_to_rewrite:
            continue
        if rewrite_op == SchedulerRewrite.Nop:
            pass  # these are fine, no bandwidth changes
        elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):
            if tens.purpose == TensorPurpose.FeatureMap:
                bw = estimate_memory_bandwidth(
                    arch,
                    arch.fast_storage_mem_area,
                    BandwidthDirection.Read,
                    tens,
                    ifm_block,
                    replacement_read_bws[tens],
                )
            else:
                bw = replacement_read_bws[tens]
            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += bw
            # the read is now accounted to fast storage; do not count it again below
            replacement_read_bws[tens] = 0

    for tens in ps.outputs:
        if force_outputs_to_fast_storage:
            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_bandwidth(
                arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block
            )
        else:
            bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_bandwidth(
                arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block
            )

    for tens in ps.intermediates:
        bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()

        if tens in replacement_read_bws:
            bw = replacement_read_bws[tens]
        else:
            bw = tens.bandwidth()

        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw

    for tens in ps.inputs:
        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_bandwidth(
            arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, replacement_read_bws.get(tens)
        )

    # quick build access counts for only current pass, even though these aren't the final numbers
    update_summary_cycles(arch, bws, cycles)

    return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple

677

678

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

679

def update_summary_cycles(arch, bws, cycles):
    """Fill in the memory-access and total entries of cycles and return it.

    Each memory-access entry is the summed bandwidth of that memory area
    divided by arch.memory_bandwidths_per_cycle for the area.  The Total
    entry is the maximum of all entries that precede it.  cycles is updated
    in place.
    """
    access_slots = (
        (PassCycles.SramAccess, MemArea.Sram),
        (PassCycles.DramAccess, MemArea.Dram),
        (PassCycles.OnChipFlashAccess, MemArea.OnChipFlash),
        (PassCycles.OffChipFlashAccess, MemArea.OffChipFlash),
    )
    for cycle_slot, mem_area in access_slots:
        cycles[cycle_slot] = np.sum(bws[mem_area]) / arch.memory_bandwidths_per_cycle[mem_area]

    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles

def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):
    """Return the given statistics unchanged as a (bws, macs, cycles) tuple; arch is unused."""
    collated = (bws, macs, cycles)
    return collated

695

696

697

def performance_for_cascaded_pass(arch, cps):
    """Evaluate every pass of a cascaded pass and accumulate the results.

    Each pass gets its bandwidths, macs, cycles and n_blocks attributes set;
    the collated totals are stored on cps and returned as (bws, macs, cycles).
    """
    total_bws, total_macs, total_cycles = make_metrics_arrays()

    for ps in cps.passes:
        ps_bws, ps_macs, ps_cycles, ps_blocks, _, _ = performance_metrics_for_pass(arch, ps)
        ps.bandwidths = ps_bws
        ps.macs = ps_macs
        ps.cycles = ps_cycles
        ps.n_blocks = ps_blocks
        total_bws += ps_bws
        total_macs += ps_macs
        total_cycles += ps_cycles

    collated = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles)
    cps.bandwidths, cps.macs, cps.cycles = collated
    bws, macs, cycles = collated
    return bws, macs, cycles

717

718

719

def calc_performance_for_network(nng, arch):

720

total_bws = make_bandwidth_array()

721

total_macs = np.zeros(MacCount.Size)

722

total_cycles = np.zeros(PassCycles.Size)

723

724

for sg in nng.subgraphs:

725

for cps in sg.cascaded_passes:

726

bws, macs, cycles = performance_for_cascaded_pass(arch, cps)

727

total_bws += bws

728

total_macs += macs

729

total_cycles += cycles

Tim Hall