# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# NPU performance estimation functions that estimate the performance of a Pass and CascadedPass. Uses a model that
# takes the maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
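#
# The model is roofline-style: for every operator the estimate is roughly
#
#     op_cycles ~ max(cycles required for computing, cycles required for bandwidth)
#
# where the bandwidth term is derived from element access counts scaled by memory transfer efficiency
# (see update_summary_cycles, which takes the maximum over the NPU and per-memory-area access cycle counts).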
import copy
import csv
from enum import auto
from enum import IntEnum
from typing import Optional
from typing import Set
from uuid import UUID

import numpy as np

from . import numeric_util
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import NpuBlockType
from .architecture_features import SHRAMElements
from .architecture_features import TensorFormat
from .debug_database import DebugDatabase
from .nn_graph import Graph
from .nn_graph import NetworkType
from .nn_graph import PassPlacement
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import Kernel
from .operation import Op
from .scheduler import Schedule
from .scheduler import SchedulerOperation
from .scheduler import SchedulerOpInfo
from .shape4d import Shape4D
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype
from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type
from .weight_compressor import WeightKey


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return (
            "NPU",
            "SRAM Access",
            "DRAM Access",
            "On-chip Flash Access",
            "Off-chip Flash Access",
            "Total",
            "Size",
        )[self.value]

    def identifier_name(self):
        return (
            "npu",
            "sram_access",
            "dram_access",
            "on_chip_flash_access",
            "off_chip_flash_access",
            "total",
            "size",
        )[self.value]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


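# PerformanceQuery is a plain record of everything the estimators need to know about one operator:
# shapes, formats, memory areas and bit widths for the IFM(s), OFM and constants, plus the kernel
# and the block config chosen by the scheduler. All measure_*/estimate_* functions below consume it.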
class PerformanceQuery:
    def __init__(self, npu_block_type=0):
        self.npu_block_type = npu_block_type
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm2_memory_area = MemArea.Unknown
        self.ifm_bits = 0
        self.ifm2_bits = 0
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown
        self.kernel = Kernel(1, 1)
        self.config = ArchitectureBlockConfig()


class CycleCost:
    def __init__(self):
        self.op_macs = 0
        self.op_cycles = 0

    def __mul__(self, scale):
        out = CycleCost()
        out.op_macs = self.op_macs * scale
        out.op_cycles = self.op_cycles * scale
        return out

    def __iadd__(self, rhs):
        self.op_macs += rhs.op_macs
        self.op_cycles += rhs.op_cycles
        return self

    def __str__(self):
        return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles)


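# CycleCost above and ElementAccess below both overload `*` and `+=` so that per-sub-shape results
# can be scaled and accumulated, e.g. (illustrative sketch; `repeats` is a hypothetical loop count):
#
#     total = CycleCost()
#     total += measure_cycle_cost(arch, op_type, faf_type, query) * repeats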
class ElementAccess:
    def __init__(self):
        # List of ONLY element access counts, consumers
        # need to scale these values by the correct bitwidths
        # to calculate memory bandwidth
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        out = ElementAccess()
        out.ifm_read[0] = self.ifm_read[0] * scale
        out.ifm_read[1] = self.ifm_read[1] * scale
        out.ofm_write = self.ofm_write * scale
        out.weights_refetch = self.weights_refetch * scale
        out.const_read[0] = self.const_read[0] * scale
        out.const_read[1] = self.const_read[1] * scale
        return out

    def __iadd__(self, rhs):
        self.ifm_read[0] += rhs.ifm_read[0]
        self.ifm_read[1] += rhs.ifm_read[1]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        self.const_read[0] += rhs.const_read[0]
        self.const_read[1] += rhs.const_read[1]
        return self

    def __str__(self):
        return "ifm read = {}, ofm write = {}, const read={}".format(self.ifm_read, self.ofm_write, self.const_read)


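# Worked example (illustrative): for an 8-bit NHWC tensor of shape 1x8x8x16, _strides_for_shape
# returns byte strides [1024, 128, 16, 1.0] for the +N, +Y, +X and +Z axes respectively
# (the innermost stride is computed with true division).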
def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
    if format == TensorFormat.NHWC:
        strides = [0, 0, 0, 0]
        strides[3] = element_bits / 8  # +Z
        strides[2] = (element_bits * shape.depth) // 8  # +X
        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
    elif format == TensorFormat.NHCWB16:
        strides = [0, 0, 0, 0, 0]
        strides[4] = element_bits / 8  # +Z
        strides[3] = (element_bits * 16) / 8  # +X
        strides[2] = (element_bits * 16 * shape.width) / 8  # +C
        strides[1] = (element_bits * shape.width * shape.depth) / 8  # +Y
        strides[0] = (element_bits * shape.width * shape.depth * shape.height) / 8  # +N

    return strides


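# The efficiency model below picks an achievable burst length from the tensor format, strides and
# block size, then inflates the payload by the ratio of the memory's native burst to that achievable
# burst. Illustrative sketch with assumed numbers: a 128-byte DRAM burst with a 32-byte achievable
# burst costs 1024 payload bytes as 1024 * (128 / 32) = 4096 bytes of traffic.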
def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
    burst_len = 8

    strides = _strides_for_shape(shape4D, format, element_bits)

    if format == TensorFormat.NHCWB16:
        if strides[2] == block_size.depth:  # TODO is this check correct for non 8-bit
            burst_len = element_bits * block_size.depth * block_size.width
        elif is_read:
            burst_len = 16 * element_bits * block_size.width
        else:
            burst_len = 16 * element_bits * block_size.width * arch.ncores
    elif format == TensorFormat.NHWC:
        if is_read:
            if strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = element_bits * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_len = burst_len // 8  # bits->bytes
    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)


def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
    # Input block HW transfer (only for elements present)
    ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements()
    cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read]
    cycles_ifm_blk = cycles_ifm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm_memory_area,
            query.ifm_format,
            query.ifm_bits,
            query.config.ifm_block,
            query.ifm_shape,
            ifm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ifm_memory_area]
    )
    # Output block HW transfer (only for elements present)
    ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements()
    cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write]
    cycles_ofm_blk = cycles_ofm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            False,
            query.ofm_memory_area,
            query.ofm_format,
            query.ofm_bits,
            query.config.ofm_block,
            query.ofm_shape,
            ofm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ofm_memory_area]
    )
    return cycles_ifm_blk, cycles_ofm_blk


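# _estimate_output_cycles_per_element is a table lookup: output_perf_index selects the operator
# class and activation_perf_index the fused activation, and the slower of the two rates dominates.
# For example, an 8-bit Add with a fused Sigmoid selects output index 5 and activation index 0;
# the actual cycles-per-element values come from the arch configuration.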
def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
        # Unary op else Binary op
        output_perf_index = 0 if query.ifm2_shape is not None else 1
    elif op_type == Op.Mul and query.ofm_bits == 32:
        output_perf_index = 2
    elif op_type == Op.Mul or (
        query.npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and query.config.acc_type == SHRAMElements.Acc40
    ):
        output_perf_index = 3
    elif op_type in (Op.Add, Op.Sub):
        if False:
            # Simple Add/Sub
            output_perf_index = 4
        else:
            # Advanced Add/Sub TODO: Add as perf selection as operator variant
            output_perf_index = 5
    elif op_type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if op_type.is_elementwise_op():
        num_elems_blk = query.config.ofm_block.elements()
        ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
        cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
        cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4  # per DPU
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return cycle_per_elem


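# _estimate_conv_cycles models the DPU as a two-stage pipeline over OFM blocks: the slower of the
# MAC stage and the output stage is paid once per block, the faster stage only once in total, i.e.
# approximately
#
#     total_cycles = max(dpu_blk, out_blk) * num_ofm_blks + min(dpu_blk, out_blk)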
def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)

    if (
        arch.config.ofm_ublock.height == 2
        and query.npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and query.ofm_shape.height == 1
        # Optimisation only applies for even width tensors
        and query.ofm_shape.width % 2 == 0
        and query.kernel.height == 1
    ):
        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
        ofm_block = ofm_block.with_height(1)
    else:
        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40

    sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
    sub_kernel_x = [
        min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if query.npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if query.ifm_bits == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
            or query.npu_block_type == NpuBlockType.VectorProduct
            or query.npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert query.config.is_partkernel
            divider = 2 if query.ifm_bits == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
                delay = 3
            else:
                delay = 2

            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    # Estimate output cycles
    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
    cycles_output_blk = round_up_to_int(
        _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
    )

    # Scale and bias tensor
    if query.const_shape.depth > 0:
        cycles_bias_blk = (
            10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
    cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU

    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk

    return total_cycles


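# measure_mem2mem_cycles costs a DMA as the slower of draining the source and filling the
# destination. Worked sketch with assumed parameters: 4096 bytes from DRAM (4 bytes/cycle,
# read latency 250) to SRAM (8 bytes/cycle) gives max(4096 // 4 + 250, 4096 // 8) = 1274 cycles.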
def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
    from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
    from_cycles += arch.memory_latency[from_mem_area][BandwidthDirection.Read]
    to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
    return max(from_cycles, to_cycles)


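# Minimal usage sketch for the measure_* functions below; all field values are assumptions for
# illustration, not real hardware numbers:
#
#     query = PerformanceQuery(NpuBlockType.ConvolutionMxN)
#     query.ifm_shape = Shape4D(1, 16, 16, 32)
#     query.ofm_shape = Shape4D(1, 16, 16, 8)
#     query.ifm_bits = query.ofm_bits = 8
#     query.kernel = Kernel(3, 3)
#     query.config = block_config  # an ArchitectureBlockConfig from the scheduler
#     cycles = measure_cycle_cost(arch, Op.Conv2D, None, query)
#     access = measure_element_access(arch, query)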
def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    cycles = CycleCost()

    # Convolution/Vector product cycle calculation
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # cycles.op_macs and cycles.op_cycles should both handle >32-bits
        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
        else:
            cycles.op_macs = (
                int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
            )

        cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
    # Elementwise cycle calculation
    elif query.npu_block_type == NpuBlockType.ElementWise:
        cycles.op_macs = 0
        ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
        cycles.op_cycles = round_up_to_int(
            _estimate_output_cycles_per_element(arch, op_type, faf_type, query)
            * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
        )
    # DMA cycle calculation
    elif query.npu_block_type == NpuBlockType.Dma:
        # Return 0 since this is not an actual NPU op
        cycles.op_cycles = 0
    else:
        assert False

    return cycles


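# measure_element_access returns raw element counts only; callers convert them to bytes with the
# tensor's element size, as estimate_full_op_performance does below,
# e.g. bw = access.ifm_read[0] * ifm.element_size().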
def measure_element_access(arch, query: PerformanceQuery):
    access = ElementAccess()

    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
    ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))

    # Number of ofm blocks in the overall output shape
    ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
    ofm_block_depth = ofm_block.depth
    if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
        ofm_blocks = ofm_blocks.with_depth(1)
        ofm_block_depth = query.ifm_shape.depth

    # Convolution & pooling
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # Number of sub kernels
        sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
        subkernels = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[0])
        subkernels *= numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[1])

        ofm_block_count = ofm_blocks.elements()

        ifm_fetch = (
            Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
            * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
        )

        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            kernel_read = query.kernel.elements_wh() * 1  # force to no reread
        else:
            kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth

        weight_fetch = kernel_read * ofm_block_depth * ofm_block_count

        access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count

        if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            access.const_read[0] = weight_fetch
            access.const_read[1] = query.ofm_shape.depth  # Scales & biases
            access.weights_refetch = ofm_blocks.elements_wh()
    # Elementwise
    elif query.npu_block_type == NpuBlockType.ElementWise:
        if query.ifm_shape.elements() == 1:
            if query.ifm_bits > 8:
                # ifm is a non 8-bit scalar
                access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
        else:
            access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                if query.ifm2_shape.elements() > 1:
                    access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
                elif query.ifm2_bits > 8:
                    # ifm2 is a non 8-bit scalar
                    access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
    # DMA
    elif query.npu_block_type == NpuBlockType.Dma:
        # Return empty access since this is not an actual NPU op
        return access
    # Unknown
    else:
        assert False

    ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
    access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
    return access


def measure_performance_cost(
    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
):
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    access = ElementAccess()
    cycles = CycleCost()

    cycle_tmp = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    cycles += cycle_tmp
    access = measure_element_access(arch, sub_query)

    return access, cycles


def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles


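# estimate_full_op_performance ties the pieces together for one scheduled operator: it builds a
# PerformanceQuery, measures NPU cycles and element accesses, overlays LUT/memcpy/weight DMA
# transfers against the slack cycles available under the previous operator, converts accesses into
# raw and efficiency-scaled bandwidths, and finally applies the roofline maximum via
# update_summary_cycles.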
def estimate_full_op_performance(
    arch, schedule: Schedule, op: SchedulerOperation, prev_op: Optional[SchedulerOperation], block_config
):
    cycles_a = make_cycles_array()
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
    macs = 0

    query = PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = op.ifm.shape
    query.ifm_format = op.ifm.format
    query.ifm_memory_area = op.ifm.mem_area
    query.ifm_bits = op.ifm.dtype.size_in_bits()
    query.ifm2_shape = op.ifm2 and op.ifm2.shape
    query.ifm2_format = op.ifm2 and op.ifm2.format
    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
    query.ofm_shape = op.ofm.shape
    query.ofm_memory_area = op.ofm.mem_area
    query.ofm_bits = op.ofm.dtype.size_in_bits()
    query.ofm_format = op.ofm.format
    query.kernel = op.kernel
    query.config = block_config

    cost = schedule.cost_map[op]
    prev_cost = schedule.cost_map[prev_op] if prev_op else None
    if op.parent_op.bias:
        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
        if cost.buffered_weight_tensors:
            query.const_memory_area = cost.buffered_weight_tensors[0].mem_area
        else:
            query.const_memory_area = cost.npu_weights_tensor.mem_area

    cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query)
    cycles_a[PassCycles.Npu] = cycles.op_cycles
    macs = cycles.op_macs

    access = measure_element_access(arch, query)

    # How many NPU cycles are available under the previously executing
    # operator for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0

    # LUT Transfer
    parent_op = op.parent_op
    dma_transfer_cycles = 0
    if parent_op.activation_lut:
        lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
        src_tensor = lut_tensor.src_tensor
        if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
            bw = src_tensor.storage_size()
            dma_transfer_cycles += measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)

            bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
            # LUT read from SHRAM TODO remove?
            scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw

    # DMA Transfer
    if parent_op.type == Op.Memcpy:
        src_tensor = parent_op.ifm
        dst_tensor = parent_op.ofm
        if src_tensor.mem_area != dst_tensor.mem_area:
            bw = src_tensor.storage_size()
            dma_transfer_cycles += measure_mem2mem_cycles(arch, src_tensor.mem_area, dst_tensor.mem_area, bw)
            bws[src_tensor.mem_area][src_tensor.purpose][BandwidthDirection.Read] += bw
            bws[dst_tensor.mem_area][src_tensor.purpose][BandwidthDirection.Write] += bw

    if cost.npu_weights_tensor and cost.buffered_weight_tensors:
        # DMA Weight Transfer
        sz = 0
        # Get the size of the first DMA
        for core in range(0, arch.ncores):
            key = WeightKey(core, 0)
            if key in cost.npu_weights_tensor.encoded_ranges:
                weight_range = cost.npu_weights_tensor.encoded_ranges[key]
                sz += round_up(weight_range.total_bytes, 16)

        total_sz = len(cost.npu_weights_tensor.buffer)
        bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
        bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz

        ws_first_transfer_cycles = measure_mem2mem_cycles(
            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz
        )

        # Add cycles for Weight + Scale Transfer
        if cost.buffered_weight_tensors[0].sub_purpose == TensorSubPurpose.DoubleBuffer:
            # Double buffer - weights can be fetched in parallel
            cycles_a[PassCycles.Npu] = max(
                cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
                cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
            )
        else:
            # Standard buffer - weights can not be fetched in parallel so weight transfer
            # must be included in the result
            cycles_a[PassCycles.Npu] = (
                cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles)
            )

        # Add cycles for LUT + memcpy op Transfer
        cycles_a[PassCycles.Npu] += dma_transfer_cycles
    else:
        # Add cycles for LUT + memcpy op Transfer
        cycles_a[PassCycles.Npu] += max(dma_transfer_cycles - slack_cycles, 0)

    # OFM write
    ofm = op.parent_op.ofm
    bw = access.ofm_write * ofm.element_size()
    bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
    scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
        arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
    )

    # IFM read
    ifm = op.parent_op.ifm2 if op.reversed_operands else op.parent_op.ifm
    bw = access.ifm_read[0] * ifm.element_size()
    bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
    scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
        arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
    )

    if query.ifm2_shape:
        ifm2 = op.parent_op.ifm if op.reversed_operands else op.parent_op.ifm2
        bw = access.ifm_read[1] * ifm2.element_size()
        bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
        scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm2_memory_area,
            ifm2.format,
            op.ifm2.dtype.size_in_bits(),
            query.config.ifm_block,
            query.ifm2_shape,
            bw,
        )

    # Weight read
    if access.const_read[0] > 0:
        # alignment not accounted for in bandwidth_compression_scale_approx
        encoded_size_approx = (
            cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size()
        )
        orig_weight_size = parent_op.weights.elements()
        bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size
        bw = access.const_read[0] * bandwidth_compression_scale_approx
        bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensors:
            scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

    if access.const_read[1] > 0:
        # Scales & biases
        bw = access.const_read[1] * op.parent_op.bias.element_size()
        bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensors:
            scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

    update_summary_cycles(arch, scaled_bws, cycles_a)

    return bws, macs, cycles_a


def print_performance(
    nng: Graph,
    arch: ArchitectureFeatures,
    network_type: NetworkType,
    bws: dict,
    macs: dict,
    cycles: dict,
    mem_usage: dict,
    output_basename: str,
):
    def _percentage(part, whole):
        # desired behaviour is for division by zero to return 100%
        if whole == 0:
            return 100.0
        else:
            return part / whole * 100.0

    if network_type == NetworkType.TFLite:
        nng_optype_to_input_op_type = tflite_optype_to_builtintype
    else:
        nng_optype_to_input_op_type = tosa_optype_to_tosa_op_type

    suid_inv_map = {v: k for k, v in DebugDatabase._sourceUID.items()}

    # the header is a list (one entry per column) of tuples (column name, alignment, width, precision)
    header = [
        (f"{network_type.name}_operator", "<", 20, -1),
        ("NNG Operator", "<", 20, -1),
        ("SRAM Usage", ">", 10, 0.0),
        ("Peak%", ">", 6, 0.2),
        ("Op Cycles", ">", 10, 0.0),
        ("Network%", ">", 8, 0.2),
        ("NPU", ">", 10, 0.0),
        ("SRAM AC", ">", 10, 0.0),
        ("DRAM AC", ">", 10, 0.0),
        ("OnFlash AC", ">", 10, 0.0),
        ("OffFlash AC", ">", 11, 0.0),
        ("MAC Count", ">", 10, 0.0),
        ("Network%", ">", 8, 0.2),
        ("Util%", ">", 6, 0.2),
        ("Name", "<", 20, -1),
    ]

    # open the csv
    csv_file = open(output_basename + "_per-layer.csv", "w", encoding="UTF8")
    writer = csv.writer(csv_file)

    for sg in nng.subgraphs:

        if sg.placement != PassPlacement.Npu:
            continue

        sg_separator_text = f"\n{str('#') * 80}\nPerformance for NPU Subgraph {sg.name}"

        # the data is a list (one entry per op) of lists (matching the header columns)
        data = []
        for sched_op in sg.sched_ops:
            # get source op name
            sched_op_src_uid = DebugDatabase._optimisedUID[sched_op.parent_op][1]
            if sched_op_src_uid == DebugDatabase.NULLREF:
                src_op_type = None
            else:
                src_op_type = suid_inv_map[sched_op_src_uid].original_type

            src_op_name = nng_optype_to_input_op_type(src_op_type)

            max_macs = cycles[sched_op][PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores
            peak_sram = (
                _percentage(mem_usage[sched_op], nng.memory_used[MemArea.Sram])
                if MemArea.Sram in nng.memory_used
                else 0
            )

            data.append(
                [
                    src_op_name,
                    sched_op.op_type,
                    mem_usage[sched_op],
                    peak_sram,
                    cycles[sched_op][PassCycles.Total],
                    _percentage(cycles[sched_op][PassCycles.Total], nng.cycles[PassCycles.Total]),
                    cycles[sched_op][PassCycles.Npu],
                    cycles[sched_op][PassCycles.SramAccess],
                    cycles[sched_op][PassCycles.DramAccess],
                    cycles[sched_op][PassCycles.OnChipFlashAccess],
                    cycles[sched_op][PassCycles.OffChipFlashAccess],
                    macs[sched_op],
                    _percentage(macs[sched_op], nng.macs),
                    _percentage(macs[sched_op], max_macs),
                    sched_op.name,
                ]
            )

        # print to console
        print(sg_separator_text)
        line = ""
        line2 = ""
        for col_name, align, width, _ in header:
            line_data = f"{col_name:{align}{width}}"
            line += line_data + " "
            line2 += "-" * len(line_data) + " "
        print(line)
        print(line2)

        for op_data in data:
            line = ""
            for idx, item in enumerate(op_data):
                _, align, width, precision = header[idx]
                if precision == -1:
                    w = str(width)
                else:
                    w = str(width + precision) + "f"
                line += f"{item:{align}{w}}" + " "
            print(line)

        # print to csv
        writer.writerow((sg_separator_text,))
        writer.writerow(col_name for col_name, _, _, _ in header)
        for op_data in data:
            writer.writerow(op_data)

    # close the csv
    csv_file.close()


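# calc_new_performance_for_network runs estimate_full_op_performance over every scheduled operator,
# accumulates network totals on the Graph (bandwidths, macs, cycles, and original and NPU-encoded
# weight sizes, de-duplicated by tensor equivalence id) and optionally prints/exports the
# per-layer report.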
def calc_new_performance_for_network(
    nng: Graph,
    arch,
    network_type: NetworkType,
    verbose_performance: bool,
    output_basename: str = "output/unnamed_network",
):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)
    total_weight_size = 0
    total_encoded_weight_size = 0

    # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights
    original_weight_uuids: Set[UUID] = set()
    encoded_npu_weight_uuids: Set[UUID] = set()

    bws = {}
    macs = {}
    cycles = {}
    mem_usage = {}

    for sg in nng.subgraphs:
        prev_op = None
        for sched_op in sg.sched_ops:
            op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op]
            bws[sched_op], macs[sched_op], cycles[sched_op] = estimate_full_op_performance(
                arch, sg.schedule, sched_op, prev_op, op_info.block_config
            )

            # get op sram usage
            mem_usage[sched_op] = (
                sg.schedule.memory_snapshot[op_info.time_index]
                if op_info.time_index < len(sg.schedule.memory_snapshot)
                else 0
            )

            # Tensors for calculating weight sizes
            original_weight = sched_op.parent_op.weights
            encoded_npu_weight = op_info.npu_weights_tensor

            # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights
            if original_weight and (original_weight.equivalence_id not in original_weight_uuids):
                original_weight_uuids.add(original_weight.equivalence_id)
                total_weight_size += original_weight.values.itemsize * original_weight.values.size

            # Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights
            if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids):
                encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id)
                total_encoded_weight_size += len(encoded_npu_weight.buffer)

            total_bws += bws[sched_op]
            total_macs += macs[sched_op]
            total_cycles += cycles[sched_op]
            prev_op = sched_op

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles
    nng.total_original_weights = total_weight_size
    nng.total_npu_encoded_weights = total_encoded_weight_size

    if verbose_performance:
        print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage, output_basename)