Blame - ethosu/vela/npu_performance.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

1

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

17

# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the

18

# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.

19

#

20

# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance

21

# estimate.

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

22

from enum import auto

23

from enum import IntEnum

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

24

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

25

import numpy as np

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

26

27

from . import numeric_util

Diqing Zhong

2020-09-28 18:46:22 +0200

[diff] [blame]

28

from .architecture_features import Accelerator

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

29

from .architecture_features import Block

Diqing Zhong

2020-09-24 09:53:48 +0200

[diff] [blame]

30

from .data_type import DataType

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

31

from .nn_graph import PassPlacement

32

from .nn_graph import SchedulerRewrite

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

33

from .operation import NpuBlockType

Diqing Zhong

2020-09-24 09:53:48 +0200

[diff] [blame]

34

from .operation import Op

Diqing Zhong

2020-09-28 18:46:22 +0200

[diff] [blame]

35

from .shared_buffer_allocation import is_acc_40bits_used

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

36

from .tensor import MemArea

37

from .tensor import shape_num_elements

Diqing Zhong

2020-11-24 14:38:20 +0100

[diff] [blame]

38

from .tensor import Tensor

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

39

from .tensor import TensorBlockTraversal

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

40

from .tensor import TensorFormat

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

41

from .tensor import TensorPurpose

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

42

43

44

def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2):
    """Return the worst-case ``[height, width]`` derived from the IFM block of ``ps2``.

    The width is the IFM block width needed by ``ps2``; the height is that IFM block
    height plus one ``ps1`` block height, rounded up to a multiple of the ``ps1``
    block height.

    Args:
        arch: architecture object; must provide ``calc_ifm_block_depth``,
            ``get_ifm_block_size`` and ``ofm_block_max``.
        ps1: first pass (not read by this function; only its block config is used).
        block_config_ps1: block config of ``ps1``; element 0 is used as its block height.
        ps2: second pass; its ``primary_op`` and ``npu_block_type`` are read.
        block_config_ps2: block config of ``ps2``; indexed from the end
            (``[-4]``, ``[-3]``, ``[-1]``).

    Returns:
        A two-element list ``[height, width]``.
    """
    # Block is constructed elsewhere in this file as (width, height, depth), so
    # [-3] is taken as width, [-4] as height and [-1] as depth.
    ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1])
    kernel = ps2.primary_op.kernel

    if ps2.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
        op = ps2.primary_op
        ifm_block_depth = arch.calc_ifm_block_depth(op.ifm.shape[-1], op.ifm.dtype.size_in_bits())
    else:
        ifm_block_depth = block_config_ps2[-1]

    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)

    # The performed height calculation is for worst case
    height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0])
    width = ifm_block.width
    return [height, width]

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

60

61

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

62

class PassCycles(IntEnum):
    """Indices into a per-pass cycle-count array.

    ``Size`` is not a real category: it is the number of categories and is used
    as the array length. ``all()`` therefore excludes it.
    """

    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        """Return the human-readable name of this member."""
        return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[
            self.value
        ]

    def identifier_name(self):
        """Return the lower-case, underscore-separated identifier of this member."""
        return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[
            self.value
        ]

    @staticmethod
    def all():
        """Return every real category, i.e. all members except ``Size``."""
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

93

class MacCount(IntEnum):
    """Indices into a MAC-count array.

    ``Size`` is the number of real categories (used as the array length) and is
    excluded from ``all()``.
    """

    NeuralNetworkMacs = 0
    HardwareMacs = auto()
    Size = auto()

    def display_name(self):
        """Return the human-readable name of this member."""
        return ("Neural Network Macs", "Hardware Macs", "Size")[self.value]

    def identifier_name(self):
        """Return the lower-case, underscore-separated identifier of this member."""
        return ("nn_macs", "hardware_macs", "size")[self.value]

    @staticmethod
    def all():
        """Return every real category, i.e. all members except ``Size``."""
        return (MacCount.NeuralNetworkMacs, MacCount.HardwareMacs)

107

108

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

109

class BandwidthDirection(IntEnum):
    """Indices into the direction axis of a bandwidth array.

    ``Size`` is the number of real directions (used as the axis length) and is
    excluded from ``all()``.
    """

    Read = 0
    Write = auto()
    Size = auto()

    def display_name(self):
        """Return the member name unchanged, e.g. ``"Read"``."""
        return self.name

    def identifier_name(self):
        """Return the member name in lower case, e.g. ``"read"``."""
        return self.name.lower()

    @staticmethod
    def all():
        """Return every real direction, i.e. all members except ``Size``."""
        return (BandwidthDirection.Read, BandwidthDirection.Write)

123

124

125

def make_bandwidth_array():
    """Return a zeroed array indexed by ``[MemArea, TensorPurpose, BandwidthDirection]``.

    Each axis has the length given by the corresponding enum's ``Size`` member.
    The array uses NumPy's default dtype for ``np.zeros``.
    """
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))

127

128

129

def make_macs_array():
    """Return a zeroed integer array with one entry per ``MacCount`` category.

    The array length is ``MacCount.Size``.
    """
    # np.int was only an alias for the builtin int; it was deprecated in NumPy 1.20
    # and removed in NumPy 1.24, so the builtin is used directly (same dtype).
    return np.zeros(MacCount.Size, int)

131

132

133

def make_cycles_array():
    """Return a zeroed array with one entry per ``PassCycles`` category.

    The array length is ``PassCycles.Size``; it uses NumPy's default dtype for ``np.zeros``.
    """
    return np.zeros(PassCycles.Size)

135

136

137

def make_metrics_arrays():
    """Return a fresh ``(bandwidths, macs, cycles)`` tuple of zeroed metric arrays."""
    return (make_bandwidth_array(), make_macs_array(), make_cycles_array())

139

140

141

def get_n_blocks_and_area(
    ifm_brick_size, ifm_height_width, orig_skirt, clamped_skirt, block_config, min_block_size, strides
):
    """Count the blocks needed to cover an IFM and the total area read for them.

    The IFM height/width (minus the original skirt) is split into full-size blocks
    plus remainder blocks along each edge. For every block class the read
    dimensions are the block size scaled by the stride plus the clamped skirt on
    both sides, each skirt rounded up to the brick size.

    Args:
        ifm_brick_size: sequence; elements 1 and 2 are used as the height and width
            rounding granularity for the skirt.
        ifm_height_width: two-element sequence ``(height, width)`` of the IFM.
        orig_skirt: four-element sequence; element ``i`` and ``2 + i`` are subtracted
            from dimension ``i``.
        clamped_skirt: four-element sequence, indexed like ``orig_skirt``, added to
            the read dimensions.
        block_config: sequence; elements 0 and 1 are used as the block height and width.
        min_block_size: sequence; elements 0 and 1 are the rounding granularity for the
            remainder block height and width.
        strides: sequence; elements 1 and 2 are used as the height and width strides.

    Returns:
        ``(total_blocks, total_area, block_setup)`` where ``block_setup`` is a tuple of
        four ``(n_blocks, block_size)`` entries (full blocks, two edge remainders and
        the corner remainder).

    Raises:
        AssertionError: if a block count is negative or no block at all is produced.
    """

    ifm_block_config = (block_config[0] * strides[1], block_config[1] * strides[2])

    n_normal_blocks = []
    remainder_size = []
    for i in range(2):
        non_skirt_dim = ifm_height_width[i] - orig_skirt[i] - orig_skirt[2 + i]
        n_blocks = non_skirt_dim // ifm_block_config[i]
        n_normal_blocks.append(n_blocks)
        # When the dimension divides evenly the inner expression is (-1 // stride) + 1,
        # which is 0 for any positive stride (floor division of -1 yields -1).
        remainder_dim = numeric_util.round_up(
            ((non_skirt_dim - n_blocks * ifm_block_config[i] - 1) // strides[i + 1]) + 1, min_block_size[i]
        )
        remainder_size.append(remainder_dim)

    # this will actually calculate reads into the edge padding.

    # there are four cases in total, handling the edges that will not fill a complete block.

    # 0000000001
    # 0000000001
    # 0000000001
    # 0000000001
    # 0000000001
    # 0000000001
    # 2222222223
    total_blocks = 0
    total_area = 0

    block_setup = (
        (n_normal_blocks[0] * n_normal_blocks[1], block_config),
        (1 * n_normal_blocks[1], (remainder_size[0], block_config[1])),
        (n_normal_blocks[0] * 1, (block_config[0], remainder_size[1])),
        (1 * 1, remainder_size),
    )

    for n_blocks, block_size in block_setup:
        # A zero-sized remainder means that edge case does not occur.
        if block_size[0] == 0 or block_size[1] == 0:
            continue
        read_dims = [0, 0]
        for i in range(2):
            read_dims[i] = (
                numeric_util.round_up(clamped_skirt[i], ifm_brick_size[i + 1])
                + block_size[i] * strides[i + 1]
                + numeric_util.round_up(clamped_skirt[2 + i], ifm_brick_size[i + 1])
            )
        assert n_blocks >= 0
        total_blocks += n_blocks
        total_area += n_blocks * read_dims[0] * read_dims[1]
    assert total_blocks >= 1
    return total_blocks, total_area, block_setup

194

195

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

196

def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth):
    """Return the IFM block depth, capped at the IFM depth.

    For ``ConvolutionMxN`` and ``ReduceSum`` the depth is 16 when the element width
    is 16 bits or the traversal is ``PartKernelFirst``, 32 for 8-bit elements, and
    8 otherwise. For every other block type the OFM block depth is used.

    Args:
        npu_block_type: ``NpuBlockType`` of the operation.
        ifm_depth: depth of the IFM; upper bound of the result.
        ifm_elemwidth: IFM element width in bits.
        block_traversal: ``TensorBlockTraversal`` of the weights.
        ofm_blk_depth: OFM block depth, used as the default.

    Returns:
        ``min(ifm_depth, selected block depth)``.
    """
    ifm_blk_depth = ofm_blk_depth

    if npu_block_type == NpuBlockType.ConvolutionMxN or npu_block_type == NpuBlockType.ReduceSum:
        if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
            ifm_blk_depth = 16
        elif ifm_elemwidth == 8:
            ifm_blk_depth = 32
        else:
            ifm_blk_depth = 8

    return min(ifm_depth, ifm_blk_depth)

208

209

Diqing Zhong

2020-11-24 14:38:20 +0100

[diff] [blame]

210

def get_minimal_cmd_cycles(arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, dpu_cycles=0):
    """Estimate a lower bound on the cycles taken by a single block command.

    The estimate sums the read latency of the IFM memory area, the cycles to
    transfer one IFM block, the DPU and output cycles, the write latency of the
    OFM memory area and the cycles to transfer one OFM block, then divides the
    sum by 4.

    Args:
        arch: architecture object; ``memory_bandwidths_per_cycle`` is read and it is
            forwarded to ``estimate_memory_bandwidth``.
        ifm_tensor: IFM tensor; its ``dtype`` and ``mem_area`` are read.
        ofm_tensor: OFM tensor; its ``dtype`` and ``mem_area`` are read.
        ifm_blk: IFM block dimensions.
        ofm_blk: OFM block dimensions.
        output_cycles: output-stage cycles for one block.
        dpu_cycles: DPU cycles for one block (defaults to 0).

    Returns:
        The estimated cycle count as a float.
    """
    # Fixed per-memory-area access latencies, in cycles.
    latencies_rd = {MemArea.Sram: 32, MemArea.Dram: 500, MemArea.OnChipFlash: 64, MemArea.OffChipFlash: 64}
    latencies_wr = {MemArea.Sram: 32, MemArea.Dram: 250, MemArea.OnChipFlash: 64, MemArea.OffChipFlash: 64}
    # Temporary single-block tensors so the bandwidth of one block can be estimated.
    ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
    ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
    cycles_ifm_blk = (
        estimate_memory_bandwidth(arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk)
        / arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area]
    )
    cycles_ofm_blk = (
        estimate_memory_bandwidth(arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk)
        / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
    )
    # NOTE(review): the reason for the division by 4 is not visible here — TODO confirm.
    return (
        latencies_rd[ifm_tensor.mem_area]
        + cycles_ifm_blk
        + dpu_cycles
        + output_cycles
        + latencies_wr[ofm_tensor.mem_area]
        + cycles_ofm_blk
    ) / 4

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

233

def estimate_output_cycles(
    arch,
    npu_block_type,
    primary_op,
    num_elems,
    ifm_tensor,
    ofm_tensor,
    use_acc_40bits=False,
    ifm2_tensor=None,
    block_config: Block = None,
):
    """Estimate the output-stage cycles needed to produce ``num_elems`` elements.

    An output performance index (0-7) is selected from the operation and data
    types, and an activation performance index (0-2) from the fused activation.
    The cycles per element is the larger of the two corresponding entries in
    ``arch.output_cycles_per_elem`` and ``arch.activation_cycles_per_elem``. For
    elementwise operations with a block config it is additionally bounded below
    by the minimal command cycles per element.

    Args:
        arch: architecture object providing the per-element cycle tables.
        npu_block_type: ``NpuBlockType`` of the operation.
        primary_op: the operation; its ``type``, ``activation`` and ``attrs`` are read.
        num_elems: number of output elements to account for.
        ifm_tensor: first input tensor.
        ofm_tensor: output tensor.
        use_acc_40bits: whether 40-bit accumulators are in use.
        ifm2_tensor: second input tensor, or ``None`` for unary operations. It is
            dereferenced for Add/Sub, so it must be given for those.
        block_config: optional block dimensions used for the elementwise command bound.

    Returns:
        ``num_elems`` multiplied by the estimated cycles per element.
    """
    faf = None if primary_op.activation is None else primary_op.activation.op_type
    if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32:
        if ifm2_tensor is None:
            # Unary op
            output_perf_index = 0
        else:
            # Binary op
            output_perf_index = 1
    elif primary_op.type == Op.Mul and ofm_tensor.dtype == DataType.int32:
        output_perf_index = 2
    elif primary_op.type == Op.Mul or (
        npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and use_acc_40bits
    ):
        output_perf_index = 3
    elif primary_op.type in (Op.Add, Op.Sub):
        input_scale = ifm_tensor.quantization.scale_f32
        input2_scale = ifm2_tensor.quantization.scale_f32
        output_scale = ofm_tensor.quantization.scale_f32

        if "resizebilinear" in primary_op.attrs:
            output_scale = input2_scale

        if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale:
            # Simple Add/Sub
            output_perf_index = 4
        else:
            # Advanced Add/Sub
            output_perf_index = 5
    elif primary_op.type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if primary_op.type.is_elementwise_op() and block_config is not None:
        # A block cannot complete faster than the minimal command time, so spread that
        # time over the block's elements and use it as a lower bound.
        num_elems_blk = block_config.width * block_config.height * block_config.depth
        cycle_cmd = get_minimal_cmd_cycles(
            arch, ifm_tensor, ofm_tensor, block_config, block_config, num_elems_blk * cycle_per_elem
        )
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return num_elems * cycle_per_elem

304

305

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

306

def estimate_conv_pooling_cycles(
    arch,
    npu_block_type,
    primary_op,
    ifm_block: Block,
    ofm_block: Block,
    block_traversal,
    kernel_dims,
    ifm_tensor,
    ofm_tensor,
    scale_tensor=None,
):
    """Estimate the total NPU cycles for a convolution, pooling or similar operation.

    DPU cycles for one OFM block are accumulated over all sub-kernels, output
    cycles for one OFM block come from ``estimate_output_cycles``, and both are
    bounded below by the minimal command cycles. The larger of the two is
    multiplied by the number of OFM blocks and the smaller is added once.

    Args:
        arch: architecture object; ``config.ofm_ublock``, ``sub_kernel_limits``,
            ``accelerator_config`` and ``ncores`` are read.
        npu_block_type: ``NpuBlockType`` of the operation.
        primary_op: the operation, forwarded to ``estimate_output_cycles``.
        ifm_block: IFM block dimensions.
        ofm_block: OFM block dimensions. NOTE: ``ofm_block.height`` is set to 1 on
            the caller's object when the 1-row micro-block case below applies.
        block_traversal: ``TensorBlockTraversal`` of the weights.
        kernel_dims: kernel ``(height, width)``.
        ifm_tensor: IFM tensor.
        ofm_tensor: OFM tensor.
        scale_tensor: optional scale/bias tensor; when truthy, a bias-fetch bound
            based on its memory area is applied to the output cycles.

    Returns:
        The estimated total cycle count.
    """
    ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth)
    ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)
    ofm_tens_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1)

    if (
        arch.config.ofm_ublock.height == 2
        and npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and ofm_tens_shape[1] == 1
        # Optimisation only applies for even width tensors
        and ofm_tens_shape[2] % 2 == 0
        and kernel_dims[0] == 1
    ):
        ofm_ublock.width = 4
        ofm_ublock.height = 1
        ofm_block.height = 1

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = ofm_block.height // ofm_ublock.height
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = ofm_block.depth // ofm_ublock.depth
    num_elems_blk = ofm_block.width * ofm_block.height * ofm_block.depth
    use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)

    # Split the kernel into sub-kernels no larger than the architecture limits; the
    # last sub-kernel along each axis holds whatever is left over.
    sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
    sub_kernel_x = [
        min((kernel_dims[1] - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((kernel_dims[0] - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if ifm_tensor.dtype.size_in_bits() == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst)
            or npu_block_type == NpuBlockType.VectorProduct
            or npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert block_traversal == TensorBlockTraversal.PartKernelFirst
            divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        # Extra delay cycles that apply when the block holds very few micro-blocks.
        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            delay = (
                3
                if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128)
                else 2
            )
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal == TensorBlockTraversal.PartKernelFirst:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    # These block types iterate over the full IFM depth one IFM block depth at a time.
    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape[3], ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    num_ofm_blk = (
        numeric_util.round_up_divide(ofm_tens_shape[1], ofm_block.height)
        * numeric_util.round_up_divide(ofm_tens_shape[2], ofm_block.width)
        * numeric_util.round_up_divide(ofm_tens_shape[3], ofm_block.depth)
    )

    cycles_output_blk = estimate_output_cycles(
        arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, use_acc_40bits
    )

    if scale_tensor:
        if scale_tensor.mem_area is MemArea.Sram:
            latency = 32
        elif scale_tensor.mem_area is MemArea.Dram:
            latency = 500
        else:
            latency = 64
        cycles_bias_blk = 10 * min(ofm_block.depth, ofm_tens_shape[3]) * latency / 256
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    # NOTE: the two cycle counts are passed in the opposite order to the parameter
    # names of get_minimal_cmd_cycles (output_cycles, dpu_cycles); they are only
    # summed there, so the call is kept as-is.
    cycles_cmd = get_minimal_cmd_cycles(
        arch, ifm_tensor, ofm_tensor, ifm_block, ofm_block, cycles_dpu_blk, cycles_output_blk
    )
    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    # The slower stage is paid once per OFM block; the faster stage is added once.
    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blk + cycles_dpu_blk

    return total_cycles

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

452

def estimate_memory_bandwidth(arch, mem_area, direction, tensor, block_size: Block, replace_bw=None):
    """Estimate the effective bandwidth of a tensor transfer, scaled by burst efficiency.

    For tensors that are not in NHWC or NHCWB16 format the bandwidth (or
    ``replace_bw`` when given) is returned unchanged. Otherwise a burst length is
    derived from the format, the block size and the tensor strides, capped at the
    maximum burst length of the memory area, and the bandwidth is multiplied by
    ``max_burst_len / burst_len``.

    Args:
        arch: architecture object; ``ncores`` is read and it is passed to ``set_format``.
        mem_area: ``MemArea`` being accessed; selects the maximum burst length
            (32 for Sram, 128 otherwise).
        direction: ``BandwidthDirection``; ``Read`` is treated as an IFM access.
        tensor: the tensor being transferred. It is cloned, so it is not modified.
        block_size: block dimensions; ``width`` and ``depth`` are read.
        replace_bw: optional bandwidth to use instead of the tensor's own ``bandwidth()``.

    Returns:
        The scaled bandwidth.
    """
    if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16):
        return tensor.bandwidth() if replace_bw is None else replace_bw

    # Estimate memory transfer efficiency by calculating the burst length
    # this is related to data format, block shape, and tensor shape, etc.
    max_burst_len = 32 if mem_area == MemArea.Sram else 128
    burst_len = 0
    elem_size = tensor.dtype.size_in_bytes()
    is_ifm = direction == BandwidthDirection.Read
    # Work on a clone so that changing the format does not affect the caller's tensor.
    tens = tensor.clone()
    if not tens.avoid_NHCWB16:
        tens.set_format(TensorFormat.NHCWB16, arch)

    if tens.format == TensorFormat.NHCWB16:
        if tens.get_strides()[1] == block_size.depth:
            burst_len = elem_size * block_size.depth * block_size.width
        elif is_ifm:
            burst_len = 16 * elem_size * block_size.width
        else:
            burst_len = 16 * elem_size * block_size.width * arch.ncores
    else:
        assert tens.format == TensorFormat.NHWC
        if is_ifm:
            if tens.get_strides()[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = elem_size * block_size.depth
        else:
            if block_size.depth <= 16 and tens.get_strides()[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)

    burst_len = min(max_burst_len, burst_len)
    bw = tens.bandwidth() if replace_bw is None else replace_bw

    # Shorter bursts are less efficient, so the bandwidth cost is scaled up accordingly.
    return bw * (max_burst_len / burst_len)

490

491

Michael McGeagh

6f72526

2020-12-03 15:21:36 +0000

[diff] [blame]

492

def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

493

if block_config is None:

494

block_config = ps.block_config

495

bws = make_bandwidth_array()

496

macs = make_macs_array()

497

cycles = make_cycles_array()

498

blocks = 0

499

ifm_read_multiple = 1

500

weight_read_multiple = 0

501

Michael McGeagh

2020-12-02 12:39:03 +0000

[diff] [blame^]

502

if ps.placement in (PassPlacement.MemoryOnly, PassPlacement.StartupInit):

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

503

return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple # nothing real happening in this pass

504

505

min_block_size = arch.min_block_sizes[ps.npu_block_type]

506

507

skirt = (0, 0, 0, 0)

508

explicit_padding = (0, 0, 0, 0)

509

primary_op = ps.primary_op

510

replacement_read_bws = {}

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

511

ofm_block = Block(block_config[1], block_config[0], block_config[3])

512

ifm_block = Block(block_config[1], block_config[0], block_config[3])

513

Tim Hall

1bd531d

2020-11-01 20:59:36 +0000

[diff] [blame]

514

if ps.placement == PassPlacement.Npu and primary_op:

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

515

skirt = primary_op.attrs.get("skirt", skirt)

516

explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)

Louis Verhaard

aee5d75

2020-09-30 09:01:52 +0200

[diff] [blame]

517

assert primary_op.type.npu_block_type == ps.npu_block_type

518

npu_block_type = primary_op.type.npu_block_type

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

519

520

ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

521

ifm_tensor_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

522

Diqing Zhong

2020-11-24 14:38:20 +0100

[diff] [blame]

523

if npu_block_type == NpuBlockType.ReduceSum:

524

block_traversal = TensorBlockTraversal.DepthFirst

525

elif npu_block_type in (

526

NpuBlockType.ConvolutionMxN,

527

NpuBlockType.ConvolutionDepthWise,

528

NpuBlockType.VectorProduct,

529

):

530

block_traversal = weight_tensor.block_traversal

531

else:

532

block_traversal = TensorBlockTraversal.Default

533

ifm_block_depth = get_ifm_block_depth(

534

npu_block_type, ifm_tensor_shape[3], ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth

535

)

536

ifm_block = arch.get_ifm_block_size(

537

ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode

538

)

539

Michael McGeagh

2020-12-02 12:39:03 +0000

[diff] [blame^]

540

if npu_block_type in (

541

NpuBlockType.ConvolutionMxN,

542

NpuBlockType.ConvolutionDepthWise,

543

NpuBlockType.Pooling,

544

NpuBlockType.ReduceSum,

Tim Hall

c30f495

2020-06-15 20:47:35 +0100

[diff] [blame]

545

):

Charles Xu

2020-04-22 08:31:43 +0200

[diff] [blame]

546

# extend the ifm to full dimension

547

ifm_tensor_brick_size = tuple(numeric_util.full_shape(4, list(ifm_tensor.brick_size), 1))

Charles Xu

2020-04-22 08:31:43 +0200

[diff] [blame]

548

ifm_tensor_bandwidth_shape = numeric_util.full_shape(4, ifm_tensor.bandwidth_shape, 1)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

549

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

550

batch_size = ifm_tensor_shape[0]

Charles Xu

2020-04-22 08:31:43 +0200

[diff] [blame]

551

ifm_depth = ifm_tensor_bandwidth_shape[3]

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

552

553

# add in padding

554

ifm_tensor_shape[1] += explicit_padding[0] + explicit_padding[2] # height += top and bottom

555

ifm_tensor_shape[2] += explicit_padding[1] + explicit_padding[3] # width += left and right

556

557

strides = primary_op.attrs["strides"]

558

if npu_block_type != NpuBlockType.Pooling:

Diqing Zhong

2020-09-28 18:46:22 +0200

[diff] [blame]

559

if npu_block_type == NpuBlockType.ReduceSum:

Diqing Zhong

2020-09-28 18:46:22 +0200

[diff] [blame]

560

weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]]

561

weight_tensor_bandwidth_shape = [0] * 4

562

weight_tensor_element_size = 0

563

weight_tensor_bandwidth_compression_scale = 0.0

564

else:

Diqing Zhong

2020-09-28 18:46:22 +0200

[diff] [blame]

565

weight_tensor_shape = weight_tensor.shape

566

weight_tensor_bandwidth_shape = weight_tensor.bandwidth_shape

567

weight_tensor_element_size = weight_tensor.element_size()

568

weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

569

nn_ops = (

570

int(ofm_tensor.shape[0])

571

* int(ofm_tensor.shape[1])

572

* int(ofm_tensor.shape[2])

573

* int(weight_tensor_shape[0])

574

* int(weight_tensor_shape[1])

575

* int(weight_tensor_shape[2])

576

* int(weight_tensor_shape[3])

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

577

)

578

else:

579

weight_tensor_shape = [

580

primary_op.attrs["ksize"][1],

581

primary_op.attrs["ksize"][2],

1,

ifm_tensor_shape[3],

]

weight_tensor_bandwidth_shape = weight_tensor_shape

586

weight_tensor_element_size = 0

587

weight_tensor_bandwidth_compression_scale = 0.0

588

nn_ops = 0 # pooling doesn't count as NN ops

589

590

kernel_dims = weight_tensor_shape[:2]

591

592

sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]

593

# count the sub kernels; the IFM block needs to be refetched for each of them

594

n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])

595

n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])

596

n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x

597

598

clamped_skirt = list(skirt)

599

clamped_skirt[2] = min(clamped_skirt[2], sub_kernel_limits[0] - 1 - clamped_skirt[0])

600

clamped_skirt[3] = min(clamped_skirt[3], sub_kernel_limits[1] - 1 - clamped_skirt[1])

601

n_blocks, area, block_setup = get_n_blocks_and_area(

Charles Xu

2020-04-22 08:31:43 +0200

[diff] [blame]

602

ifm_tensor_brick_size,

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

603

ifm_tensor_shape[1:3],

skirt,

clamped_skirt,

block_config,

min_block_size,

strides,

)

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

611

blocks = n_blocks * numeric_util.round_up_divide(weight_tensor_shape[3], ofm_block.depth)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

612

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

613

n_weight_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

614

if npu_block_type == NpuBlockType.ConvolutionDepthWise or npu_block_type == NpuBlockType.Pooling:

615

n_weight_stages = 1 # force to no reread

ifm_tensor_bw = (

n_sub_kernels

* batch_size

* area

* ifm_depth

* n_weight_stages

* ifm_tensor.element_size()

624

* ifm_tensor.bandwidth_compression_scale

625

)

626

replacement_read_bws[ifm_tensor] = ifm_tensor_bw

627

ifm_read_multiple = n_weight_stages

628

629

replacement_read_bws[weight_tensor] = (

630

batch_size

631

* shape_num_elements(weight_tensor_bandwidth_shape)

632

* weight_tensor_element_size

633

* weight_tensor_bandwidth_compression_scale

634

* n_blocks

635

) # read once per block and batch

636

weight_read_multiple = n_blocks

637

638

n_kernel_xy = kernel_dims[0] * kernel_dims[1]

639

n_input_channels_at_a_time = block_config[2]

640

Michael McGeagh

2020-12-02 12:39:03 +0000

[diff] [blame^]

641

if (npu_block_type == NpuBlockType.Pooling) or (

642

block_traversal in (TensorBlockTraversal.PartKernelFirst, TensorBlockTraversal.DepthWise)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

643

):

644

n_input_channels_at_a_time = numeric_util.round_up_divide(n_input_channels_at_a_time, 4)

645

n_kernel_xy = max(

646

n_kernel_xy, 4

647

) # need at least 4, as this is the minimum duty cycle for secondary accumulator writes

648

if weight_tensor is not None:

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

649

n_kernel_xy = numeric_util.round_up(n_kernel_xy, 4) # weights need to be read in blocks of 4

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

650

651

num_mac_ops = 0

652

for n_blocks_for_size, block_size in block_setup:

num_mac_ops += (

batch_size

* n_blocks_for_size

* block_size[0]

* block_size[1]

* numeric_util.round_up(weight_tensor_shape[2], n_input_channels_at_a_time)

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

659

* numeric_util.round_up(weight_tensor_shape[3], ofm_block.depth)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

660

* n_kernel_xy

661

)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

662

macs[MacCount.NeuralNetworkMacs] += nn_ops

663

macs[MacCount.HardwareMacs] += num_mac_ops

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

664

cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(

Diqing Zhong

2020-11-16 16:15:56 +0100

[diff] [blame]

665

arch,

666

npu_block_type,

667

primary_op,

Diqing Zhong

2020-11-24 14:38:20 +0100

[diff] [blame]

668

ifm_block,

Diqing Zhong

2020-11-16 16:15:56 +0100

[diff] [blame]

ofm_block,

block_traversal,

kernel_dims,

ifm_tensor,

ofm_tensor,

ps.scale_tensor,

Diqing Zhong

2020-09-28 18:46:22 +0200

[diff] [blame]

675

)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

676

elif npu_block_type == NpuBlockType.VectorProduct:

677

nn_macs = (

678

ifm_tensor.shape[0]

679

* numeric_util.round_up(weight_tensor.shape[-2], block_config[2])

680

* numeric_util.round_up(weight_tensor.shape[-1], block_config[3])

681

)

682

num_mac_ops = nn_macs

683

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

684

cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(

Diqing Zhong

2020-11-24 14:38:20 +0100

[diff] [blame]

685

arch, npu_block_type, primary_op, ifm_block, ofm_block, block_traversal, [1, 1], ifm_tensor, ofm_tensor,

Diqing Zhong

2020-09-28 18:46:22 +0200

[diff] [blame]

686

)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

687

macs[MacCount.NeuralNetworkMacs] += nn_macs

688

macs[MacCount.HardwareMacs] += num_mac_ops

689

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

690

blocks = 1 * numeric_util.round_up_divide(weight_tensor.shape[-1], ofm_block.depth)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

691

692

non_zero_fraction = 1.0

693

if ifm_tensor.values is not None:

694

nz_vector = np.amax(ifm_tensor.values != 0, axis=0) # max across batch axis

695

non_zero_fraction = np.average(nz_vector)

696

697

replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth()

698

replacement_read_bws[weight_tensor] = weight_tensor.bandwidth() * non_zero_fraction

699

ifm_read_multiple = 1

700

weight_read_multiple = non_zero_fraction

Diqing Zhong

2020-09-24 09:53:48 +0200

[diff] [blame]

701

elif npu_block_type == NpuBlockType.ElementWise:

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

702

# Work out how many elements we have and calculate performance.

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

703

cycles[PassCycles.Npu] = estimate_output_cycles(

Diqing Zhong

2020-11-24 14:38:20 +0100

[diff] [blame]

arch,

npu_block_type,

primary_op,

ofm_tensor.elements(),

ps.ifm_tensor,

ps.ofm_tensor,

None,

ps.ifm2_tensor,

ofm_block,

Diqing Zhong

2020-09-28 18:46:22 +0200

[diff] [blame]

713

)

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

714

715

prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None)

716

if prev_npu_pass is None:

717

# cycles for DMA ops in first pass

718

dma_ops = (op for op in ps.ops if op.type == Op.DMA)

719

for dma_op in dma_ops:

720

mem_area = dma_op.attrs["source"]

721

for tens in dma_op.inputs:

722

cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area]

723

Michael McGeagh

6f72526

2020-12-03 15:21:36 +0000

[diff] [blame]

724

if rewrite_list is not None:

725

# apply the desired rewrites

726

for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:

727

if ps != ps_to_rewrite:

728

continue

729

if rewrite_op == SchedulerRewrite.Nop:

730

pass # these are fine, no bandwidth changes

731

elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):

732

if tens.purpose == TensorPurpose.FeatureMap:

733

bw = estimate_memory_bandwidth(

734

arch,

735

arch.fast_storage_mem_area,

736

BandwidthDirection.Read,

737

tens,

738

ifm_block,

739

replacement_read_bws[tens],

740

)

741

else:

742

bw = replacement_read_bws[tens]

743

bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += bw

744

replacement_read_bws[tens] = 0

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

745

746

for tens in ps.outputs:

747

if force_outputs_to_fast_storage:

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

748

bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_bandwidth(

749

arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block

750

)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

751

else:

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

752

bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_bandwidth(

753

arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block

754

)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

755

756

for tens in ps.intermediates:

757

bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()

758

759

if tens in replacement_read_bws:

760

bw = replacement_read_bws[tens]

761

else:

762

bw = tens.bandwidth()

763

764

bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw

765

766

for tens in ps.inputs:

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

767

bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_bandwidth(

768

arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, replacement_read_bws.get(tens)

769

)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

770

771

# quick build access counts for only current pass, even though these aren't the final numbers

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

772

update_summary_cycles(arch, bws, cycles)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

773

774

return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple

775

776

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

777

def update_summary_cycles(arch, bws, cycles):
    """Refresh the memory-access and total entries of a cycles array.

    For each of the four memory areas handled here (SRAM, DRAM, on-chip flash
    and off-chip flash) the matching access entry of ``cycles`` is set to the
    summed bandwidth recorded for that area in ``bws`` divided by
    ``arch.memory_bandwidths_per_cycle`` for the same area. The
    ``PassCycles.Total`` entry is then set to the maximum of all entries that
    precede it.

    ``cycles`` is updated in place and also returned.
    """
    access_entry_for_area = (
        (PassCycles.SramAccess, MemArea.Sram),
        (PassCycles.DramAccess, MemArea.Dram),
        (PassCycles.OnChipFlashAccess, MemArea.OnChipFlash),
        (PassCycles.OffChipFlashAccess, MemArea.OffChipFlash),
    )
    for access_entry, mem_area in access_entry_for_area:
        cycles[access_entry] = np.sum(bws[mem_area]) / arch.memory_bandwidths_per_cycle[mem_area]

    # the total is the largest of the entries stored before PassCycles.Total
    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles

def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):
    """Return the given bandwidth, MAC and cycle statistics as a 3-tuple.

    The values are passed through untouched; ``arch`` is accepted but not used.
    """
    collated = (bws, macs, cycles)
    return collated

793

794

795

def performance_for_cascaded_pass(arch, cps):
    """Compute performance statistics for every pass of a cascaded pass.

    Each pass in ``cps.passes`` is evaluated with
    ``performance_metrics_for_pass``; the resulting bandwidths, MACs, cycles
    and block count are stored on the pass (``bandwidths``, ``macs``,
    ``cycles``, ``n_blocks``) and the first three are accumulated. The
    accumulated values are run through ``collate_stats_for_cascaded_pass``,
    stored on ``cps`` and returned as ``(bws, macs, cycles)``.
    """
    accumulated_bws = make_bandwidth_array()
    accumulated_macs = make_macs_array()
    accumulated_cycles = make_cycles_array()

    for ps in cps.passes:
        # the two read-multiple results are not needed here
        pass_bws, pass_macs, pass_cycles, pass_blocks, _, _ = performance_metrics_for_pass(arch, ps)

        # record the per-pass results on the pass itself
        ps.bandwidths = pass_bws
        ps.macs = pass_macs
        ps.cycles = pass_cycles
        ps.n_blocks = pass_blocks

        # accumulate into the cascaded-pass totals
        accumulated_bws += pass_bws
        accumulated_macs += pass_macs
        accumulated_cycles += pass_cycles

    cps_bws, cps_macs, cps_cycles = collate_stats_for_cascaded_pass(
        arch, accumulated_bws, accumulated_macs, accumulated_cycles
    )
    cps.bandwidths = cps_bws
    cps.macs = cps_macs
    cps.cycles = cps_cycles
    return cps_bws, cps_macs, cps_cycles

815

816

817

def calc_performance_for_network(nng, arch):

818

total_bws = make_bandwidth_array()

819

total_macs = np.zeros(MacCount.Size)

820

total_cycles = np.zeros(PassCycles.Size)

821

822

for sg in nng.subgraphs:

823

for cps in sg.cascaded_passes:

824

bws, macs, cycles = performance_for_cascaded_pass(arch, cps)

825

total_bws += bws

826

total_macs += macs

827

total_cycles += cycles

Tim Hall