# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# NPU performance estimation functions for estimating the performance of a Pass and CascadedPass. Uses a model that
# takes the maximum of the 'cycles required for bandwidth' and the 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
from enum import auto
from enum import IntEnum

import numpy as np

from . import numeric_util
from .architecture_features import Accelerator
from .architecture_features import Block
from .data_type import DataType
from .nn_graph import PassPlacement
from .nn_graph import SchedulerRewrite
from .operation import NpuBlockType
from .operation import Op
from .shared_buffer_allocation import is_acc_40bits_used
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import shape_num_elements
from .tensor import Tensor
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose


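# A minimal sketch (not used by the compiler) of the roofline-style model described in
# the header comment: a pass costs the maximum of its compute-bound and bandwidth-bound
# cycle counts. All names and numbers here are hypothetical.
def _example_roofline_cycles(compute_cycles, bytes_transferred, bytes_per_cycle):
    # Cycles required for bandwidth: total memory traffic over available bandwidth
    bandwidth_cycles = bytes_transferred / bytes_per_cycle
    # The pass is limited by whichever of the two resources is busier
    return max(compute_cycles, bandwidth_cycles)

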
def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2):
    ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1])
    kernel = ps2.primary_op.kernel

    if ps2.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
        op = ps2.primary_op
        ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0].depth, op.ifm.dtype.size_in_bits())
    else:
        ifm_block_depth = block_config_ps2[-1]

    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)

    # The height calculation performed here is for the worst case
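    # e.g. (hypothetical numbers): an IFM block height of 8 and a pass-1 block height of
    # 16 give round_up(8 + 16, 16) == 32 rows of rolling buffer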
    height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0])
    width = ifm_block.width
    return [height, width]


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[
            self.value
        ]

    def identifier_name(self):
        return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[
            self.value
        ]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


def make_metrics_arrays():
    return (make_bandwidth_array(), 0, make_cycles_array())


def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth):
    ifm_blk_depth = ofm_blk_depth

    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
            ifm_blk_depth = 16
        elif ifm_elemwidth == 8:
            ifm_blk_depth = 32
        else:
            ifm_blk_depth = 8

    return min(ifm_depth, ifm_blk_depth)

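# A worked example of the selection above (hypothetical values): an 8-bit IFM with
# depth-first traversal in a ConvolutionMxN block selects a 32-deep IFM block,
# clamped to the real IFM depth:
#     get_ifm_block_depth(NpuBlockType.ConvolutionMxN, 24, 8,
#                         TensorBlockTraversal.DepthFirst, 16)  # -> min(24, 32) == 24
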

def get_minimal_cmd_cycles(
    arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, ifm_shape4D, ofm_shape4D, dpu_cycles=0
):
    ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
    ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
    cycles_ifm_blk = (
        estimate_memory_transfer_efficiency(
            arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk, shape4D=ifm_shape4D
        )
        / arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area]
    )
    cycles_ofm_blk = (
        estimate_memory_transfer_efficiency(
            arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk, shape4D=ofm_shape4D
        )
        / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
    )
    return (
        arch.memory_latency[ifm_tensor.mem_area][BandwidthDirection.Read]
        + cycles_ifm_blk
        + dpu_cycles
        + output_cycles
        + arch.memory_latency[ofm_tensor.mem_area][BandwidthDirection.Write]
        + cycles_ofm_blk
    ) / 4


def estimate_output_cycles(
    arch,
    npu_block_type,
    primary_op,
    num_elems,
    ifm_tensor,
    ofm_tensor,
    use_acc_40bits=False,
    ifm2_tensor=None,
    block_config: Block = None,
):
    faf = None if primary_op.activation is None else primary_op.activation.op_type
    if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32:
        if ifm2_tensor is None:
            # Unary op
            output_perf_index = 0
        else:
            # Binary op
            output_perf_index = 1
    elif primary_op.type == Op.Mul and ofm_tensor.dtype == DataType.int32:
        output_perf_index = 2
    elif primary_op.type == Op.Mul or (
        npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and use_acc_40bits
    ):
        output_perf_index = 3
    elif primary_op.type in (Op.Add, Op.Sub):
        input_scale = ifm_tensor.quantization.scale_f32
        input2_scale = ifm2_tensor.quantization.scale_f32
        output_scale = ofm_tensor.quantization.scale_f32

        if "resizebilinear" in primary_op.attrs:
            output_scale = input2_scale

        if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale:
            # Simple Add/Sub
            output_perf_index = 4
        else:
            # Advanced Add/Sub
            output_perf_index = 5
    elif primary_op.type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if primary_op.type.is_elementwise_op() and block_config is not None:
        num_elems_blk = block_config.width * block_config.height * block_config.depth
        cycle_cmd = get_minimal_cmd_cycles(
            arch,
            ifm_tensor,
            ofm_tensor,
            block_config,
            block_config,
            num_elems_blk * cycle_per_elem,
            primary_op.ifm_shapes[0],
            primary_op.ofm_shapes[0],
        )
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return num_elems * cycle_per_elem


def estimate_conv_pooling_cycles(
    arch,
    npu_block_type,
    primary_op,
    ifm_block: Block,
    ofm_block: Block,
    block_traversal,
    kernel_dims,
    ifm_tensor,
    ofm_tensor,
    scale_tensor=None,
):
    ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth)
    ifm_tens_shape = primary_op.ifm_shapes[0]
    ofm_tens_shape = primary_op.ofm_shapes[0]

    if (
        arch.config.ofm_ublock.height == 2
        and npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and ofm_tens_shape.height == 1
        # Optimisation only applies for even-width tensors
        and ofm_tens_shape.width % 2 == 0
        and kernel_dims[0] == 1
    ):
        ofm_ublock.width = 4
        ofm_ublock.height = 1
        ofm_block.height = 1

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = ofm_block.height // ofm_ublock.height
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = ofm_block.depth // ofm_ublock.depth
    num_ofm_blk = 0
    total_cycles = 0
    num_elems_blk = ofm_block.width * ofm_block.height * ofm_block.depth
    use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)

    sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
    sub_kernel_x = [
        min((kernel_dims[1] - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((kernel_dims[0] - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)
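    # e.g. a hypothetical 3x9 (HxW) kernel under an assumed 8x8 sub-kernel limit splits
    # into sub_kernel_y == [3] and sub_kernel_x == [8, 1], i.e. two sub-kernels of 24 and
    # 3 elements, each of which refetches the IFM block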

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if ifm_tensor.dtype.size_in_bits() == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst)
            or npu_block_type == NpuBlockType.VectorProduct
            or npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert block_traversal == TensorBlockTraversal.PartKernelFirst
            divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            delay = (
                3
                if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128)
                else 2
            )
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal == TensorBlockTraversal.PartKernelFirst:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    num_ofm_blk = (
        numeric_util.round_up_divide(ofm_tens_shape.height, ofm_block.height)
        * numeric_util.round_up_divide(ofm_tens_shape.width, ofm_block.width)
        * numeric_util.round_up_divide(ofm_tens_shape.depth, ofm_block.depth)
    )
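    # e.g. a hypothetical 64x64x32 OFM traversed in 16x16x16 blocks gives
    # 4 * 4 * 2 == 32 OFM blocks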

    cycles_output_blk = estimate_output_cycles(
        arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, use_acc_40bits
    )

    if scale_tensor:
        cycles_bias_blk = (
            10
            * min(ofm_block.depth, ofm_tens_shape.depth)
            * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read]
            / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    cycles_cmd = get_minimal_cmd_cycles(
        arch,
        ifm_tensor,
        ofm_tensor,
        ifm_block,
        ofm_block,
        cycles_dpu_blk,
        ifm_tens_shape,
        ofm_tens_shape,
        cycles_output_blk,
    )
    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blk + cycles_dpu_blk

    return total_cycles


def estimate_memory_transfer_efficiency(
    arch, mem_area, direction, tensor, block_size: Block, replace_bw=None, shape4D=None
):
    if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16):
        return tensor.bandwidth() if replace_bw is None else replace_bw

    # Estimate memory transfer efficiency by calculating the burst length; the achievable
    # burst length depends on the data format, the block shape and the tensor shape
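    # e.g. (hypothetical numbers) an int8 NHCWB16 read whose channel stride matches the
    # block depth bursts block.depth * block.width bytes at a time; if only a 16-byte
    # burst is achieved against a 64-byte natural burst length for the memory, the
    # estimated bandwidth below is scaled up by 64 / 16 == 4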
    burst_len = 0
    elem_size = tensor.dtype.size_in_bytes()
    is_ifm = direction == BandwidthDirection.Read
    tens = tensor.clone()

    if not tensor.needs_linear_format:
        tens.set_format(TensorFormat.NHCWB16, arch)
    strides = tens.get_strides(shape4D=shape4D)

    if tens.format == TensorFormat.NHCWB16:
        if strides[1] == block_size.depth:
            burst_len = elem_size * block_size.depth * block_size.width
        elif is_ifm:
            burst_len = 16 * elem_size * block_size.width
        else:
            burst_len = 16 * elem_size * block_size.width * arch.ncores
    else:
        assert tens.format == TensorFormat.NHWC
        if is_ifm:
            if strides[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = elem_size * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)

    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    bw = tens.bandwidth() if replace_bw is None else replace_bw

    return bw * (arch.memory_burst_length[mem_area] / burst_len)


def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):
    if block_config is None:
        block_config = ps.block_config
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
    macs = 0
    cycles = make_cycles_array()
    ifm_read_multiple = 1
    weight_read_multiple = 0

    if ps.placement in (PassPlacement.MemoryOnly, PassPlacement.StartupInit):
        return bws, macs, cycles, ifm_read_multiple, weight_read_multiple  # nothing real happening in this pass

    explicit_padding = (0, 0, 0, 0)
    primary_op = ps.primary_op
    replacement_read_bws = {}
    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    ifm_block = Block(block_config[1], block_config[0], block_config[3])

    if ps.placement == PassPlacement.Npu and primary_op:
        explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
        assert primary_op.type.npu_block_type == ps.npu_block_type
        npu_block_type = primary_op.type.npu_block_type

        ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
        ifm_tensor_shape = ps.primary_op.ifm_shapes[0]
        ofm_tensor_shape = ps.primary_op.ofm_shapes[0]
        ofm_block.width = min(ofm_block.width, ofm_tensor_shape.width)
        ofm_block.height = min(ofm_block.height, ofm_tensor_shape.height)
        ofm_block.depth = min(ofm_block.depth, ofm_tensor_shape.depth)

        if npu_block_type == NpuBlockType.ReduceSum:
            block_traversal = TensorBlockTraversal.DepthFirst
        elif npu_block_type in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.VectorProduct,
        ):
            block_traversal = weight_tensor.block_traversal
        else:
            block_traversal = TensorBlockTraversal.Default
        ifm_block_depth = get_ifm_block_depth(
            npu_block_type, ifm_tensor_shape.depth, ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
        )
        ifm_block = arch.get_ifm_block_size(
            ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode
        )
        ifm_block.width = min(ifm_block.width, ifm_tensor_shape.width)
        ifm_block.height = min(ifm_block.height, ifm_tensor_shape.height)

        if npu_block_type in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.VectorProduct,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
        ):
            # extend the IFM to its full dimensions
            batch_size = ifm_tensor_shape.batch

            # add in padding, height += top and bottom, width += left and right
            ifm_tensor_shape = ifm_tensor_shape.add(
                0, explicit_padding[0] + explicit_padding[2], explicit_padding[1] + explicit_padding[3], 0
            )

            if npu_block_type != NpuBlockType.Pooling:
                if npu_block_type == NpuBlockType.ReduceSum:
                    weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]]
                    weight_tensor_bandwidth_shape = [0] * 4
                    weight_tensor_element_size = 0
                    weight_tensor_bandwidth_compression_scale = 0.0
                else:
                    # For vector products, the IO weight format is extended to HWIO, with H = W = 1
                    weight_tensor_shape = numeric_util.full_shape(4, weight_tensor.shape, 1)
                    weight_tensor_bandwidth_shape = numeric_util.full_shape(4, weight_tensor.bandwidth_shape, 1)
                    weight_tensor_element_size = weight_tensor.element_size()
                    weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale

                nn_ops = (
                    int(ofm_tensor_shape.batch)
                    * int(ofm_tensor_shape.height)
                    * int(ofm_tensor_shape.width)
                    * int(weight_tensor_shape[0])
                    * int(weight_tensor_shape[1])
                    * int(weight_tensor_shape[2])
                    * int(weight_tensor_shape[3])
                )
            else:
                weight_tensor_shape = [
                    *primary_op.get_kernel_size(),
                    1,
                    ifm_tensor_shape.depth,
                ]
                weight_tensor_bandwidth_shape = weight_tensor_shape
                weight_tensor_element_size = 0
                weight_tensor_bandwidth_compression_scale = 0.0
                nn_ops = 0  # pooling doesn't count as NN ops

            kernel_dims = weight_tensor_shape[:2]

            sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
            # Count the sub-kernels; the IFM block needs to be refetched for each of them
            n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
            n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
            n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x

            n_full_depth_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth)
            if npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
                n_full_depth_stages = 1  # force no re-read
Tim Hall79d07d22020-04-27 18:20:16 +0100533
Diqing Zhong69aadd02020-12-08 13:08:48 +0100534 ifm_read_multiple = n_sub_kernels * n_full_depth_stages
535 replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() * ifm_read_multiple
Tim Hall79d07d22020-04-27 18:20:16 +0100536
Diqing Zhong69aadd02020-12-08 13:08:48 +0100537 weight_read_multiple = numeric_util.round_up_divide(
patrik.gustavssoneeb85152020-12-21 17:10:40 +0000538 ofm_tensor_shape.height, ofm_block.height
539 ) * numeric_util.round_up_divide(ofm_tensor_shape.width, ofm_block.width)
Tim Hall79d07d22020-04-27 18:20:16 +0100540 replacement_read_bws[weight_tensor] = (
541 batch_size
542 * shape_num_elements(weight_tensor_bandwidth_shape)
543 * weight_tensor_element_size
544 * weight_tensor_bandwidth_compression_scale
Diqing Zhong69aadd02020-12-08 13:08:48 +0100545 * weight_read_multiple
546 )
Tim Hall79d07d22020-04-27 18:20:16 +0100547
Diqing Zhong69aadd02020-12-08 13:08:48 +0100548 macs += nn_ops
Diqing Zhong42e833d2020-10-02 13:18:42 +0200549 cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
Diqing Zhong986e3192020-11-16 16:15:56 +0100550 arch,
551 npu_block_type,
552 primary_op,
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100553 ifm_block,
Diqing Zhong986e3192020-11-16 16:15:56 +0100554 ofm_block,
555 block_traversal,
556 kernel_dims,
557 ifm_tensor,
558 ofm_tensor,
559 ps.scale_tensor,
Diqing Zhong09387e22020-09-28 18:46:22 +0200560 )
Diqing Zhonge8887a32020-09-24 09:53:48 +0200561 elif npu_block_type == NpuBlockType.ElementWise:
Tim Hall79d07d22020-04-27 18:20:16 +0100562 # Work out how many elements we have and calculate performance.
Diqing Zhong42e833d2020-10-02 13:18:42 +0200563 cycles[PassCycles.Npu] = estimate_output_cycles(
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100564 arch,
565 npu_block_type,
566 primary_op,
567 ofm_tensor.elements(),
568 ps.ifm_tensor,
569 ps.ofm_tensor,
570 None,
571 ps.ifm2_tensor,
572 ofm_block,
Diqing Zhong09387e22020-09-28 18:46:22 +0200573 )
Diqing Zhong42e833d2020-10-02 13:18:42 +0200574
575 prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None)
576 if prev_npu_pass is None:
577 # cycles for DMA ops in first pass
578 dma_ops = (op for op in ps.ops if op.type == Op.DMA)
579 for dma_op in dma_ops:
580 mem_area = dma_op.attrs["source"]
581 for tens in dma_op.inputs:
582 cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area]
583
Michael McGeagh6f725262020-12-03 15:21:36 +0000584 if rewrite_list is not None:
585 # apply the desired rewrites
586 for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
587 if ps != ps_to_rewrite:
588 continue
589 if rewrite_op == SchedulerRewrite.Nop:
590 pass # these are fine, no bandwidth changes
591 elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):
Diqing Zhong69aadd02020-12-08 13:08:48 +0100592 bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens]
Michael McGeagh6f725262020-12-03 15:21:36 +0000593 if tens.purpose == TensorPurpose.FeatureMap:
Diqing Zhong69aadd02020-12-08 13:08:48 +0100594 scaled_bw = estimate_memory_transfer_efficiency(
Michael McGeagh6f725262020-12-03 15:21:36 +0000595 arch,
596 arch.fast_storage_mem_area,
597 BandwidthDirection.Read,
598 tens,
599 ifm_block,
600 replacement_read_bws[tens],
601 )
602 else:
Diqing Zhong69aadd02020-12-08 13:08:48 +0100603 scaled_bw = replacement_read_bws[tens]
604 scaled_bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += scaled_bw
Michael McGeagh6f725262020-12-03 15:21:36 +0000605 replacement_read_bws[tens] = 0
Tim Hall79d07d22020-04-27 18:20:16 +0100606
607 for tens in ps.outputs:
608 if force_outputs_to_fast_storage:
Diqing Zhong69aadd02020-12-08 13:08:48 +0100609 bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
610 scaled_bws[arch.fast_storage_mem_area][tens.purpose][
611 BandwidthDirection.Write
612 ] += estimate_memory_transfer_efficiency(
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100613 arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0],
Diqing Zhonge168b962020-11-05 17:18:47 +0100614 )
Tim Hall79d07d22020-04-27 18:20:16 +0100615 else:
Diqing Zhong69aadd02020-12-08 13:08:48 +0100616 bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
617 scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_transfer_efficiency(
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100618 arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0]
Diqing Zhonge168b962020-11-05 17:18:47 +0100619 )
Tim Hall79d07d22020-04-27 18:20:16 +0100620
621 for tens in ps.intermediates:
622 bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
Diqing Zhong69aadd02020-12-08 13:08:48 +0100623 scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
Tim Hall79d07d22020-04-27 18:20:16 +0100624
625 if tens in replacement_read_bws:
626 bw = replacement_read_bws[tens]
627 else:
628 bw = tens.bandwidth()
629
630 bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
Diqing Zhong69aadd02020-12-08 13:08:48 +0100631 scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
Tim Hall79d07d22020-04-27 18:20:16 +0100632
633 for tens in ps.inputs:
Diqing Zhong69aadd02020-12-08 13:08:48 +0100634 if tens in replacement_read_bws:
635 bw = replacement_read_bws[tens]
636 else:
637 bw = tens.bandwidth()
638
639 bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100640
641 op_shape = None
642 if ps.placement == PassPlacement.Npu and primary_op:
643 if tens == ps.ifm_tensor:
644 op_shape = ps.ifm_shapes[0]
645 elif tens == ps.ifm2_tensor:
646 op_shape = ps.ifm_shapes[1]
647
Diqing Zhong69aadd02020-12-08 13:08:48 +0100648 scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_transfer_efficiency(
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100649 arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, bw, op_shape
Diqing Zhonge168b962020-11-05 17:18:47 +0100650 )
Tim Hall79d07d22020-04-27 18:20:16 +0100651
    # Quickly build access counts for only the current pass, even though these aren't the final numbers
    update_summary_cycles(arch, scaled_bws, cycles)

    return bws, macs, cycles, ifm_read_multiple, weight_read_multiple


def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles


def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):
    return bws, macs, cycles


def performance_for_cascaded_pass(arch, cps):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = make_cycles_array()

    for ps in cps.passes:
        bws, macs, cycles, _, _ = performance_metrics_for_pass(arch, ps)
        ps.bandwidths = bws
        ps.macs = macs
        ps.cycles = cycles
        total_bws += bws
        total_macs += macs
        total_cycles += cycles

    bws, macs, cycles = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles)
    cps.bandwidths = bws
    cps.macs = macs
    cps.cycles = cycles
    return bws, macs, cycles


def calc_performance_for_network(nng, arch):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)

    for sg in nng.subgraphs:
        for cps in sg.cascaded_passes:
            bws, macs, cycles = performance_for_cascaded_pass(arch, cps)
            total_bws += bws
            total_macs += macs
            total_cycles += cycles

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles