Blame - ethosu/vela/stats_writer.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

1

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

17

# Writes out per-pass and summary performance statistics to CSV files.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

18

import csv

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

19

import sys

20

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

21

import numpy as np

22

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

23

from .nn_graph import PassPlacement

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

24

from .npu_performance import BandwidthDirection

25

from .npu_performance import MacCount

26

from .npu_performance import PassCycles

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

27

from .numeric_util import round_up_to_int

Louis Verhaard

aee5d75

2020-09-30 09:01:52 +0200

[diff] [blame]

28

from .operation import Op

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

29

from .tensor import MemArea

30

from .tensor import TensorPurpose

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

31

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

32

Louis Verhaard

2020-09-29 13:57:21 +0200

[diff] [blame]

33

def mem_areas_to_report():
    """Return the list of memory areas that the statistics reports cover.

    Every area from ``MemArea.all()`` is kept except ``MemArea.Shram``.
    """
    reported = []
    for candidate in MemArea.all():
        # SHRAM is left out: its performance numbers only cover LUT usage
        if candidate != MemArea.Shram:
            reported.append(candidate)
    return reported

36

37

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

38

def write_summary_metrics_csv(nng, summary_filename, arch):
    """Write a two-row CSV file (header row + one data row) summarising ``nng``.

    Args:
        nng: Graph object; this function reads its ``name``, ``cycles``,
            ``macs``, ``bandwidths``, ``memory_used``, ``bits_per_element``,
            ``batch_size`` and ``subgraphs`` attributes.
        summary_filename: Path of the CSV file to create (or overwrite).
        arch: Architecture description; this function reads its
            ``accelerator_config``, ``system_config``, ``memory_mode``,
            ``core_clock``, ``sram_size``, ``memory_bandwidths_per_second``
            and ``tensor_storage_mem_area`` attributes.
    """
    # newline="" is what the csv module asks for: without it, platforms that
    # translate "\n" on write turn the writer's "\r\n" terminator into
    # "\r\r\n", which shows up as blank rows between records.
    with open(summary_filename, "w", newline="") as f:
        writer = csv.writer(f)
        mem_areas = mem_areas_to_report()

        # ---- header row -------------------------------------------------
        labels = [
            "experiment",
            "network",
        ]

        labels += (
            ["accelerator_configuration", "system_config", "memory_mode", "core_clock", "sram_size"]
            + [area.identifier_name() + "_bandwidth" for area in mem_areas]
            + ["weights_storage_area", "feature_map_storage_area"]
        )

        labels += [
            "inferences_per_second",
            "batch_size",
            "inference_time",
            "passes_before_fusing",
            "passes_after_fusing",
        ]
        labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]

        for mem_area in mem_areas:
            area_name = mem_area.identifier_name()
            labels += [
                area_name + "_feature_map_read_bytes",
                area_name + "_feature_map_write_bytes",
                area_name + "_weight_read_bytes",
                area_name + "_weight_write_bytes",
                area_name + "_total_bytes",
            ]

        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]

        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]

        writer.writerow(labels)

        # ---- data row (same column order as the header) -----------------
        data_items = [
            "default",
            nng.name,
        ]

        # NOTE(review): this guard suggests a falsy ``arch`` was meant to be
        # tolerated, but ``arch.core_clock`` is dereferenced unconditionally
        # just below and the header always contains the architecture columns.
        # Behaviour is kept as-is; callers are expected to pass a real arch.
        if arch:
            data_items += (
                [
                    arch.accelerator_config.name,
                    arch.system_config,
                    arch.memory_mode,
                    arch.core_clock,
                    arch.sram_size / 1024,
                ]
                + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas]
                + [
                    arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
                    arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
                ]
            )

        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.core_clock
        if midpoint_inference_time > 0:
            midpoint_fps = 1 / midpoint_inference_time
        else:
            # Avoid dividing by zero; NaN then propagates into the *_tops columns.
            midpoint_fps = np.nan

        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)

        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
        # Areas without a memory_used entry are reported as 0.
        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]

        data_items += [
            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
        ]

        for mem_area in mem_areas:
            bws = nng.bandwidths[mem_area]
            total_bw = np.sum(bws)
            weight_bws = bws[TensorPurpose.Weights]
            fm_bws = bws[TensorPurpose.FeatureMap]
            data_items += [
                fm_bws[BandwidthDirection.Read],
                fm_bws[BandwidthDirection.Write],
                weight_bws[BandwidthDirection.Read],
                weight_bws[BandwidthDirection.Write],
                total_bw,
            ]

        nn_macs = nng.macs[MacCount.NeuralNetworkMacs]
        hw_macs = nng.macs[MacCount.HardwareMacs]
        data_items += [
            nn_macs,
            hw_macs,
            # MAC count * 2 * inferences per second, scaled by 1e12
            nn_macs * 2 * midpoint_fps / 1e12,
            hw_macs * 2 * midpoint_fps / 1e12,
        ]

        data_items += [nng.cycles[kind] for kind in PassCycles.all()]

        writer.writerow(data_items)

140

141

142

def write_pass_metrics_csv(nng, pass_filename):
    """Write a CSV file with one row of statistics per pass of ``nng``.

    The first row is a header. Passes are visited starting from
    ``nng.get_root_subgraph()``; cascaded passes placed as
    ``PassPlacement.StartupInit`` are skipped, and a pass consisting of a
    single ``Op.CustomNpuOp`` is replaced by the rows of the subgraph stored
    in that op's ``attrs["subgraph"]``.

    Args:
        nng: Graph object providing ``get_root_subgraph()``.
        pass_filename: Path of the CSV file to create (or overwrite).
    """
    # newline="" is what the csv module asks for: without it, platforms that
    # translate "\n" on write turn the writer's "\r\n" terminator into
    # "\r\r\n", which shows up as blank rows between records.
    with open(pass_filename, "w", newline="") as f:
        writer = csv.writer(f)

        # (column-name fragment, tensor purposes summed into that column)
        purpose_list = (
            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
            ("weights", (TensorPurpose.Weights,)),
            ("feature_map", (TensorPurpose.FeatureMap,)),
        )

        # (column-name fragment, bandwidth directions summed into that column)
        direction_list = (
            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
            ("read", (BandwidthDirection.Read,)),
            ("write", (BandwidthDirection.Write,)),
        )

        # bandwidth_names[n] is the header of the column whose value is built
        # from the index triple in bandwidth_indices[n].
        bandwidth_names = []
        bandwidth_indices = []
        for mem_area in mem_areas_to_report():
            for purpose, purpose_candidates in purpose_list:
                for direction, direction_candidates in direction_list:
                    label = "bytes_{}_{}_{}".format(mem_area.identifier_name(), purpose, direction)
                    bandwidth_names.append(label)
                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

        all_macs = MacCount.all()
        all_cycles = (
            PassCycles.Total,
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
        )
        writer.writerow(
            [
                "name",
                "operators",
                "placement",
                "streaming_strategy",
                "block_config_height",
                "block_config_width",
                "block_config_input_channels",
                "block_config_output_channels",
                "n_blocks_in_pass",
            ]
            + ["cycles_" + v.identifier_name() for v in all_cycles]
            + [v.identifier_name() for v in all_macs]
            + bandwidth_names
            + ["sram_used"]
        )

        def write_subgraph(sg):
            """Emit one CSV row per pass of ``sg``, recursing into NPU subgraphs."""
            for cps in sg.cascaded_passes:
                if cps.placement == PassPlacement.StartupInit:
                    continue  # skip the dummy init pass

                for ps in cps.passes:
                    if len(ps.ops) == 1 and ps.ops[0].type == Op.CustomNpuOp:
                        # just treat this as a call, unroll it
                        write_subgraph(ps.ops[0].attrs["subgraph"])
                        continue

                    stats = [ps.name, " ".join(op.type.name for op in ps.ops)]
                    stats += [ps.placement.name]
                    stats += [cps.strategy.name]
                    stats += list(ps.block_config)
                    stats += [ps.n_blocks]
                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]

                    for mem_area, purposes, directions in bandwidth_indices:
                        # Each entry is rounded up individually before summing.
                        stats.append(
                            sum(
                                round_up_to_int(ps.bandwidths[mem_area, purpose, direction])
                                for purpose in purposes
                                for direction in directions
                            )
                        )

                    # Passes that have no sram_used attribute are reported as 0.
                    stats.append(getattr(ps, "sram_used", 0))

                    writer.writerow(stats)

        write_subgraph(nng.get_root_subgraph())

226

227

228

def print_performance_metrics_for_strat(

arch,

name,

cycles,

macs,

bandwidths,

batch_size,

memory_used,

num_passes,

num_cascaded_passes,

n_operations=0,

Michael McGeagh

6f72526

2020-12-03 15:21:36 +0000

[diff] [blame^]

239

cpu_operations=None,

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

240

bits_per_element=None,

241

show_cpu_operations=False,

f=sys.stdout,

):

Louis Verhaard

2020-09-29 13:57:21 +0200

[diff] [blame]

245

orig_mem_areas_labels = [(v, v.display_name()) for v in mem_areas_to_report()]

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

246

Tim Hall

2020-11-01 20:59:36 +0000

[diff] [blame]

247

midpoint_inference_time = cycles[PassCycles.Total] / arch.core_clock

Michael McGeagh

b424974

2020-07-30 14:36:40 +0100

[diff] [blame]

248

if midpoint_inference_time > 0:

249

midpoint_fps = 1 / midpoint_inference_time

250

else:

251

midpoint_fps = np.nan

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

252

253

mem_area_labels = [

254

(mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0

]

if name:

print("", file=f)

print("Network summary for", name, file=f)

Tim Hall

2020-11-01 20:59:36 +0000

[diff] [blame]

260

print("Accelerator configuration {:>20}".format(arch.accelerator_config.name), file=f)

261

print("System configuration {:>20}".format(arch.system_config), file=f)

262

print("Memory mode {:>20}".format(arch.memory_mode), file=f)

263

print("Accelerator clock {:12d} MHz".format(int(arch.core_clock / 1e6)), file=f)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

264

for mem_area, label in mem_area_labels:

265

print(

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

266

"Design peak {:25} {:12.2f} GB/s".format(

267

label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000

268

),

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

269

file=f,

270

)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

271

print(file=f)

272

for mem_area, label in mem_area_labels:

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

273

if mem_area not in memory_used:

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

274

continue

275

276

aug_label = label + " used"

277

278

extra = ""

279

if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

280

extra = " ({:.2f} bits per element)".format(bits_per_element[mem_area])

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

281

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

282

print("Total {:25} {:12.2f} KiB{}".format(aug_label, memory_used[mem_area] / 1024.0, extra), file=f)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

283

284

print(file=f)

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

285

print("{:d} passes fused into {:d}".format(num_passes, num_cascaded_passes), file=f)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

286

Michael McGeagh

6f72526

2020-12-03 15:21:36 +0000

[diff] [blame^]

287

if cpu_operations is None:

288

cpu_operations = []

289

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

290

n_cpu_operations = len(cpu_operations)

291

if n_operations > 0:

292

print(

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

293

"{:d}/{:d} ({:4.1%}) operations falling back to the CPU".format(

294

n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100

295

),

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

file=f,

)

if show_cpu_operations:

300

for op in cpu_operations:

301

302

def format_tens_list(lst):

303

return " ".join(str(list(tens.shape)) for tens in lst)

304

305

print(

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

306

"CPU operation: {} inputs {}, outputs {}".format(

307

op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)

308

),

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

file=f,

)

print("", file=f)

for mem_area, label in mem_area_labels:

315

bws = bandwidths[mem_area]

316

total_bw = np.sum(bws)

317

weight_bws = bws[TensorPurpose.Weights]

318

fm_bws = bws[TensorPurpose.FeatureMap]

319

aug_label = label + " bandwidth"

320

print(

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

321

"Average {:25} {:12.2f} GB/s".format(aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0),

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

322

file=f,

323

)

324

print(

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

325

"Input {:25} {:12.2f} MB/batch".format(

326

aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0

327

),

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

328

file=f,

329

)

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

330

print("Weight {:25} {:12.2f} MB/batch".format(aug_label, np.sum(weight_bws) / 1000.0 / 1000.0), file=f)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

331

print(

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

332

"Output {:25} {:12.2f} MB/batch".format(

333

aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0

334

),

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

335

file=f,

336

)

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

337

print("Total {:25} {:12.2f} MB/batch".format(aug_label, total_bw / 1000.0 / 1000.0), file=f)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

338

print(

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

339

"Total {:25} per input {:9.2f} MB/inference (batch size {:d})".format(

340

aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size

341

),

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

file=f,

)

print(file=f)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

346

print(

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

347

"Neural network macs {:12d} MACs/batch".format(int(macs[MacCount.NeuralNetworkMacs])),

348

file=f,

349

)

350

print("Hardware macs {:12d} MACs/batch".format(int(macs[MacCount.HardwareMacs])), file=f)

351

print(

352

"Network Tops/s {:12.2f} Tops/s".format(

353

macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12

354

),

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

355

file=f,

356

)

357

print(

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

358

"Hardware Tops/s {:12.2f} Tops/s".format(

359

macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12

360

),

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

file=f,

)

print(file=f)

for kind in PassCycles.all():

366

aug_label = kind.display_name() + " cycles"

367

cyc = cycles[kind]

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

368

print("{:30} {:12d} cycles/batch".format(aug_label, int(cyc)), file=f)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

369

print(file=f)

370

371

print(

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

372

"Batch Inference time {:7.2f} ms, {:7.2f} inferences/s (batch size {:d})".format(

373

midpoint_inference_time * 1000, midpoint_fps, batch_size

374

),

Tim Hall