Blame - ethosu/vela/npu_performance.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

17

# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the

18

# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.

19

#

20

# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance

21

# estimate.

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

22

import copy

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

23

from enum import auto

24

from enum import IntEnum

Ayaan Masood

2022-02-22 11:28:55 +0000

[diff] [blame]

25

from typing import Set

26

from uuid import UUID

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

27

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

28

import numpy as np

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

29

30

from . import numeric_util

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

31

from .architecture_allocator import ArchitectureBlockConfig

Diqing Zhong

2020-09-28 18:46:22 +0200

[diff] [blame]

32

from .architecture_features import Accelerator

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

33

from .architecture_features import NpuBlockType

34

from .architecture_features import SHRAMElements

35

from .architecture_features import TensorFormat

Ayaan Masood

2022-02-22 11:28:55 +0000

[diff] [blame]

36

from .nn_graph import Graph

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

37

from .numeric_util import round_up

Johan Alfvén

f8e353b

2022-02-04 17:24:23 +0100

[diff] [blame^]

38

from .numeric_util import round_up_to_int

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

39

from .operation import Kernel

Diqing Zhong

2020-09-24 09:53:48 +0200

[diff] [blame]

40

from .operation import Op

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

41

from .scheduler import Schedule

42

from .scheduler import SchedulerOperation

Ayaan Masood

2022-02-22 11:28:55 +0000

[diff] [blame]

43

from .scheduler import SchedulerOpInfo

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

44

from .shape4d import Shape4D

Diqing Zhong

f842b69

2020-12-11 13:07:37 +0100

[diff] [blame]

45

from .tensor import BandwidthDirection

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

46

from .tensor import MemArea

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

47

from .tensor import TensorPurpose

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

48

from .weight_compressor import WeightKey

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

49

50

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

51

class PassCycles(IntEnum):
    """Integer indices for the cycle categories stored in a cycles array.

    Members count up from 0 in declaration order. ``Size`` is declared last,
    so its value equals the number of categories before it; it is used as an
    array length rather than as a category of its own.
    """

    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        """Return the human-readable label for this member."""
        labels = (
            "NPU",
            "SRAM Access",
            "DRAM Access",
            "On-chip Flash Access",
            "Off-chip Flash Access",
            "Total",
            "Size",
        )
        return labels[self.value]

    def identifier_name(self):
        """Return the lower-case, underscore-separated name for this member."""
        identifiers = (
            "npu",
            "sram_access",
            "dram_access",
            "on_chip_flash_access",
            "off_chip_flash_access",
            "total",
            "size",
        )
        return identifiers[self.value]

    @staticmethod
    def all():
        """Return a tuple of every member except ``Size``, in index order."""
        # Iterating the enum yields members in declaration order, so slicing
        # up to Size keeps Npu .. Total and drops only the Size sentinel.
        return tuple(PassCycles)[: PassCycles.Size]

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

82

class PerformanceQuery:
    """Plain container of the inputs read by the estimation functions in this module.

    Every field starts with a neutral default and is expected to be filled in
    by the caller before the query is passed to an estimator.
    """

    def __init__(self, npu_block_type=0):
        self.npu_block_type = npu_block_type

        # First input feature map
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm_bits = 0

        # Optional second input feature map; a shape of None means "no ifm2"
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        self.ifm2_memory_area = MemArea.Unknown
        self.ifm2_bits = 0

        # Output feature map
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0

        # Constant data; the estimators treat const_shape.depth > 0 as "present"
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown

        # Kernel and block configuration
        self.kernel = Kernel(1, 1)
        self.config = ArchitectureBlockConfig()

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

101

102

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

class CycleCost:
    """Pair of counters: MAC operations and cycles.

    Supports scaling with ``*`` (returns a new object) and in-place
    accumulation with ``+=``.
    """

    def __init__(self):
        self.op_macs = 0
        self.op_cycles = 0

    def __mul__(self, scale):
        """Return a new CycleCost with both counters multiplied by ``scale``."""
        scaled = CycleCost()
        scaled.op_macs, scaled.op_cycles = self.op_macs * scale, self.op_cycles * scale
        return scaled

    def __iadd__(self, rhs):
        """Add the counters of ``rhs`` onto this object and return it."""
        self.op_macs += rhs.op_macs
        self.op_cycles += rhs.op_cycles
        return self

    def __str__(self):
        template = "macs = {}, cycles = {}"
        return template.format(self.op_macs, self.op_cycles)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

121

122

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

123

class ElementAccess:
    """Counters of element accesses, split by tensor role.

    Only element counts are stored here; consumers need to scale them by the
    correct bit widths to calculate memory bandwidth.
    """

    def __init__(self):
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        """Return a new ElementAccess with every counter multiplied by ``scale``."""
        scaled = ElementAccess()
        scaled.ifm_read = [self.ifm_read[0] * scale, self.ifm_read[1] * scale]
        scaled.ofm_write = self.ofm_write * scale
        scaled.weights_refetch = self.weights_refetch * scale
        scaled.const_read = [self.const_read[0] * scale, self.const_read[1] * scale]
        return scaled

    def __iadd__(self, rhs):
        """Add every counter of ``rhs`` onto this object and return it."""
        for idx in (0, 1):
            self.ifm_read[idx] += rhs.ifm_read[idx]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        for idx in (0, 1):
            self.const_read[idx] += rhs.const_read[idx]
        return self

    def __str__(self):
        # weights_refetch is not part of the printed summary
        template = "ifm read = {}, ofm write = {}, const read={}"
        return template.format(self.ifm_read, self.ofm_write, self.const_read)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

154

155

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

156

def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
    """Return the per-axis strides, in bytes, of a tensor of ``shape`` stored in ``format``.

    Args:
        shape: tensor shape; its ``depth``, ``width`` and ``height`` are read.
        format: ``TensorFormat.NHWC`` or ``TensorFormat.NHCWB16``.
        element_bits: size of one element in bits.

    Returns:
        A list of strides. NHWC gives four entries ordered [+N, +Y, +X, +Z];
        NHCWB16 gives five entries ordered [+N, +Y, +C, +X, +Z]. Some entries
        use true division and may therefore be floats.

    Raises:
        ValueError: if ``format`` is neither NHWC nor NHCWB16. Previously this
            case fell through to the return statement with no strides
            assigned and failed with an UnboundLocalError.
    """
    if format == TensorFormat.NHWC:
        return [
            (element_bits * shape.depth * shape.width * shape.height) // 8,  # +N
            (element_bits * shape.depth * shape.width) // 8,  # +Y
            (element_bits * shape.depth) // 8,  # +X
            element_bits / 8,  # +Z
        ]

    if format == TensorFormat.NHCWB16:
        # NOTE(review): the +N and +Y strides are the same expression (height
        # is not part of +N). Kept exactly as before - confirm whether +N
        # should include shape.height.
        return [
            (element_bits * shape.width * shape.depth) / 8,  # +N
            (element_bits * shape.width * shape.depth) / 8,  # +Y
            (element_bits * 16 * shape.width) / 8,  # +C
            (element_bits * 16) / 8,  # +X
            element_bits / 8,  # +Z
        ]

    raise ValueError("Unsupported tensor format for stride calculation: {}".format(format))

Diqing Zhong

2020-10-02 13:18:42 +0200

[diff] [blame]

172

173

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

174

def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
    """Scale ``to_transfer`` by the ratio of the memory's burst length to the estimated burst.

    A burst length is estimated in bits from the tensor format, the transfer
    direction and the block size, converted to bytes, and capped at
    ``arch.memory_burst_length[mem_area]``. The result is
    ``to_transfer * (memory burst length / estimated burst length)``.

    Args:
        arch: architecture description; ``ncores`` and ``memory_burst_length`` are read.
        is_read: truthy for a read transfer, falsy for a write.
        mem_area: key into ``arch.memory_burst_length``.
        format: tensor format of the data being transferred.
        element_bits: size of one element in bits.
        block_size: block shape; ``depth`` and ``width`` are read.
        shape4D: full tensor shape, used to compute strides.
        to_transfer: amount of data to scale.
    """
    strides = _strides_for_shape(shape4D, format, element_bits)

    # Used unchanged when neither format branch below applies
    burst_bits = 8

    if format == TensorFormat.NHCWB16:
        if strides[2] == block_size.depth:  # TODO is this check correct for non 8-bit
            burst_bits = element_bits * block_size.depth * block_size.width
        else:
            burst_bits = 16 * element_bits * block_size.width
            if not is_read:
                burst_bits = burst_bits * arch.ncores
    elif format == TensorFormat.NHWC:
        stride_matches_depth = strides[3] == block_size.depth
        if is_read:
            burst_bits = element_bits * block_size.depth
            if stride_matches_depth:
                burst_bits = burst_bits * block_size.width
        elif block_size.depth <= 16 and stride_matches_depth:
            burst_bits = element_bits * block_size.depth * block_size.width
        else:
            burst_bits = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_bytes = burst_bits // 8  # bits->bytes
    burst_bytes = min(arch.memory_burst_length[mem_area], burst_bytes)
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_bytes)

203

204

205

def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
    """Estimate the cycles to read one IFM block and to write one OFM block.

    Each figure is the memory latency for that direction plus the
    efficiency-scaled transfer size divided by the memory's bandwidth per
    cycle. Blocks are clipped to the tensor shape, so only elements that are
    present are counted.

    Returns:
        Tuple ``(cycles_ifm_blk, cycles_ofm_blk)``.
    """
    # Input block HW transfer (only for elements present)
    ifm_elems = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements()
    ifm_latency = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read]
    ifm_transfer = _estimate_memory_transfer_efficiency(
        arch,
        True,
        query.ifm_memory_area,
        query.ifm_format,
        query.ifm_bits,
        query.config.ifm_block,
        query.ifm_shape,
        ifm_elems,
    )
    cycles_ifm_blk = ifm_latency + ifm_transfer / arch.memory_bandwidths_per_cycle[query.ifm_memory_area]

    # Output block HW transfer (only for elements present)
    ofm_elems = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements()
    ofm_latency = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write]
    ofm_transfer = _estimate_memory_transfer_efficiency(
        arch,
        False,
        query.ofm_memory_area,
        query.ofm_format,
        query.ofm_bits,
        query.config.ofm_block,
        query.ofm_shape,
        ofm_elems,
    )
    cycles_ofm_blk = ofm_latency + ofm_transfer / arch.memory_bandwidths_per_cycle[query.ofm_memory_area]

    return cycles_ifm_blk, cycles_ofm_blk

239

240

241

def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    """Estimate the output-stage cycles spent per output element.

    An index into ``arch.output_cycles_per_elem`` is chosen from the block
    type, operator and bit widths, and an index into
    ``arch.activation_cycles_per_elem`` from the fused activation. The
    estimate is the larger of the two table entries. For elementwise
    operators it is additionally bounded from below by a per-element figure
    derived from the minimum block memory cycles.
    """
    mac_block_types = (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
        NpuBlockType.VectorProduct,
    )

    # Select the output table entry
    if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
        # NOTE(review): the original comment read "Unary op else Binary op",
        # yet index 0 is chosen when an ifm2 shape IS present - confirm which
        # table entry belongs to which case.
        if query.ifm2_shape is not None:
            output_perf_index = 0
        else:
            output_perf_index = 1
    elif op_type == Op.Mul:
        output_perf_index = 2 if query.ofm_bits == 32 else 3
    elif query.npu_block_type in mac_block_types and query.config.acc_type == SHRAMElements.Acc40:
        output_perf_index = 3
    elif op_type in (Op.Add, Op.Sub):
        # Advanced Add/Sub TODO: Add as perf selection as operator variant
        # (index 4, "Simple Add/Sub", is currently never selected)
        output_perf_index = 5
    elif op_type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    # Select the activation table entry
    if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if not op_type.is_elementwise_op():
        return cycle_per_elem

    # Elementwise: the command cannot complete faster than a quarter of the
    # block memory cycles plus the block output cycles
    num_elems_blk = query.config.ofm_block.elements()
    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycle_cmd = (ifm_blk_cycles + ofm_blk_cycles + cycle_per_elem * num_elems_blk) / 4  # per DPU
    return max(cycle_per_elem, cycle_cmd / num_elems_blk)

Diqing Zhong

2020-09-24 09:53:48 +0200

[diff] [blame]

290

291

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

292

def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    """Estimate total cycles for a convolution, pooling, vector-product or reduce-sum op.

    Outline:
      1. Clip the IFM/OFM blocks to the tensor shapes and choose the OFM
         micro-block, counting micro-blocks along x, y and z.
      2. Split the kernel into sub-kernels using the per-block-type limits,
         and accumulate DPU cycles plus accelerator-specific delay cycles for
         each sub-kernel.
      3. Estimate per-block output cycles, bounded below by a term derived
         from the const-memory read latency when a const tensor is present.
      4. Bound both figures below by a quarter of (memory + output + DPU)
         cycles, then combine them: the larger is multiplied by the number of
         OFM blocks and the smaller is added once.
    """
    block_type = query.npu_block_type
    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)

    use_wide_ublock = (
        arch.config.ofm_ublock.height == 2
        and block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and query.ofm_shape.height == 1
        # Optimisation only applies for even width tensors
        and query.ofm_shape.width % 2 == 0
        and query.kernel.height == 1
    )
    if use_wide_ublock:
        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
        ofm_block = ofm_block.with_height(1)
    else:
        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40
    single_ublk_xy = num_ublk_x == 1 and num_ublk_y == 1

    # Split the kernel into sub-kernels: limits[0] bounds height, limits[1] bounds width
    limits = arch.sub_kernel_limits[block_type]
    limit_h = limits[0]
    limit_w = limits[1]
    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, limit_h)
    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, limit_w)
    sub_kernel_x = [min(query.kernel.width - i * limit_w, limit_w) for i in range(n_sub_kernels_x)]
    sub_kernel_y = [min(query.kernel.height - i * limit_h, limit_h) for i in range(n_sub_kernels_y)]
    sub_kernel_sizes = [w * h for h in sub_kernel_y for w in sub_kernel_x]

    is_partkernel_conv = block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel
    is_u55_32 = arch.accelerator_config is Accelerator.Ethos_U55_32

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_sizes:
        if block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if query.ifm_bits == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif block_type in (NpuBlockType.VectorProduct, NpuBlockType.ReduceSum) or (
            block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert query.config.is_partkernel
            divider = 2 if query.ifm_bits == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        # Per-step delay depends on the accelerator and the accumulator width
        if is_u55_32:
            delay = 7 if use_acc_40bits else 3
        elif use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
            delay = 3
        else:
            delay = 2

        delay_cycles = 0
        if single_ublk_xy:
            if num_ublk_z == 1:
                delay_cycles = delay * num_kernel_steps
            elif num_kernel_steps > 1:
                delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        # Extra delay term that is only applied on Ethos_U55_32
        if is_u55_32 and (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
            delay_cycles += delay * num_ublk_z

        if is_partkernel_conv:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles + delay_cycles

    if block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    # Estimate output cycles
    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
    cycles_output_blk = round_up_to_int(
        _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
    )

    # Scale and bias tensor
    if query.const_shape.depth > 0:
        const_latency = arch.memory_latency[query.const_memory_area][BandwidthDirection.Read]
        cycles_bias_blk = 10 * ofm_block.depth * const_latency / 256
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycles_cmd = (ifm_blk_cycles + ofm_blk_cycles + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU

    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    # The larger per-block figure is paid for every block, the smaller one once
    if cycles_dpu_blk > cycles_output_blk:
        return cycles_dpu_blk * num_ofm_blks + cycles_output_blk
    return cycles_output_blk * num_ofm_blks + cycles_dpu_blk

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

418

def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
    """Estimate the cycles needed to move ``to_transfer`` between two memory areas.

    The read side is the floor-divided transfer time plus the source memory's
    read latency; the write side is the floor-divided transfer time only. The
    larger of the two is returned.
    """
    read_cycles = (
        to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
        + arch.memory_latency[from_mem_area][BandwidthDirection.Read]
    )
    write_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
    return max(read_cycles, write_cycles)

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

423

Patrik Gustavsson

ee99bb1

2021-04-08 09:04:00 +0200

[diff] [blame]

424

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

425

def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    """Estimate the MAC count and cycle count for the operation described by ``query``.

    Args:
        arch: architecture description passed through to the estimators.
        op_type: operator type of the operation.
        faf_type: fused activation function type.
        query: performance query; ``npu_block_type`` selects the calculation.

    Returns:
        A ``CycleCost`` with ``op_macs`` and ``op_cycles`` filled in.

    Raises:
        AssertionError: if ``query.npu_block_type`` is not a handled block
            type. This is raised explicitly rather than via ``assert False``
            so that it still fires when Python runs with ``-O``; otherwise an
            all-zero cost would be returned silently.
    """
    cycles = CycleCost()
    block_type = query.npu_block_type

    # Convolution/Vector product cycle calculation
    if block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # cycles.op_macs and cycles.op_cycles should both handle >32-bits
        if block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            # Depthwise and pooling do not multiply by the IFM depth
            depth_factor = 1
        else:
            depth_factor = int(query.ifm_shape.depth)

        cycles.op_macs = int(query.kernel.elements_wh()) * depth_factor * int(query.ofm_shape.elements())
        cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
    # Elementwise cycle calculation
    elif block_type == NpuBlockType.ElementWise:
        cycles.op_macs = 0
        ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
        cycles.op_cycles = round_up_to_int(
            _estimate_output_cycles_per_element(arch, op_type, faf_type, query)
            * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
        )
    else:
        raise AssertionError("Unsupported npu_block_type for cycle estimation: {}".format(block_type))

    return cycles

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

457

458

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

459

def measure_element_access(arch, query: PerformanceQuery):
    """Count the element accesses (reads and writes) for the operation described by ``query``.

    Args:
        arch: architecture description; ``storage_rounding_quantums`` and
            ``sub_kernel_limits`` are read.
        query: performance query describing the operation.

    Returns:
        An ``ElementAccess`` holding element counts (not bytes).

    Raises:
        AssertionError: if ``query.npu_block_type`` is not a handled block
            type. This is raised explicitly rather than via ``assert False``
            so that it still fires when Python runs with ``-O``.
    """
    access = ElementAccess()

    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
    ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))

    # Number of ofm blocks in the overall output shape
    ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
    ofm_block_depth = ofm_block.depth
    if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
        ofm_blocks = ofm_blocks.with_depth(1)
        ofm_block_depth = query.ifm_shape.depth

    # Convolution & pooling
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # Number of sub kernels. Index 0 bounds the kernel height and index 1
        # the kernel width, matching _estimate_conv_cycles; the two indices
        # were previously swapped here. The product is unchanged whenever
        # both limits are equal.
        sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
        subkernels = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
        subkernels *= numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])

        ofm_block_count = ofm_blocks.elements()

        ifm_fetch = (
            Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
            * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
        )

        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            kernel_read = query.kernel.elements_wh() * 1  # force to no reread
        else:
            kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth

        weight_fetch = kernel_read * ofm_block_depth * ofm_block_count

        access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count

        # Pooling and ReduceSum record no weight or scale reads
        if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            access.const_read[0] = weight_fetch
            access.const_read[1] = query.ofm_shape.depth  # Scales & biases
            access.weights_refetch = ofm_blocks.elements_wh()
    # Elementwise
    elif query.npu_block_type == NpuBlockType.ElementWise:
        if query.ifm_shape.elements() == 1:
            if query.ifm_bits > 8:
                # ifm is a non 8-bit scalar
                access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
        else:
            access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                if query.ifm2_shape.elements() > 1:
                    access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
                elif query.ifm2_bits > 8:
                    # ifm2 is a non 8-bit scalar
                    access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
    # Unknown
    else:
        raise AssertionError("Unsupported npu_block_type for element access: {}".format(query.npu_block_type))

    ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
    access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
    return access

def measure_performance_cost(
    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
):
    """Measure element accesses and cycle cost for a sub-region of the query's OFM.

    Args:
        arch: architecture description passed through to the estimators.
        op_type: operator type of the operation.
        faf_type: fused activation function type.
        query: performance query; it is deep-copied, so the caller's object
            is not modified.
        offset: start of the sub-region within the OFM, or None for the origin.
        sub_shape: shape of the sub-region, or None for the whole OFM. A
            given shape is first reduced to at most the OFM shape.

    Returns:
        Tuple ``(access, cycles)`` of ``ElementAccess`` and ``CycleCost``.
    """
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    # Work on a copy whose OFM is restricted to the requested region
    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    # Both estimators return freshly created result objects, so no
    # intermediate accumulator objects are needed here.
    cycles = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    access = measure_element_access(arch, sub_query)

    return access, cycles

558

559

560

def make_bandwidth_array():
    """Return a zero-filled array indexed by [MemArea, TensorPurpose, BandwidthDirection]."""
    dimensions = (MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size)
    return np.zeros(dimensions)

562

563

564

def make_cycles_array():
    """Return a zero-filled 1-D array with one slot per PassCycles index below ``Size``."""
    slot_count = PassCycles.Size
    return np.zeros(slot_count)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

566

567

Diqing Zhong

2020-11-05 17:18:47 +0100

[diff] [blame]

568

def update_summary_cycles(arch, bws, cycles):
    """Fill in the memory access and total entries of a cycles array, in place.

    For each of the Sram, Dram, OnChipFlash and OffChipFlash memory areas, the summed bandwidth in bws for that
    area is divided by arch.memory_bandwidths_per_cycle for the same area and stored in the matching PassCycles
    entry. PassCycles.Total is then set to the maximum of all entries that precede it.

    Returns the same cycles array that was passed in.
    """
    per_cycle = arch.memory_bandwidths_per_cycle
    access_entries = (
        (PassCycles.SramAccess, MemArea.Sram),
        (PassCycles.DramAccess, MemArea.Dram),
        (PassCycles.OnChipFlashAccess, MemArea.OnChipFlash),
        (PassCycles.OffChipFlashAccess, MemArea.OffChipFlash),
    )
    for cycles_entry, mem_area in access_entries:
        cycles[cycles_entry] = np.sum(bws[mem_area]) / per_cycle[mem_area]

    # The total is the largest of the individual entries, not their sum
    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

582

def estimate_full_op_performance(
    arch, schedule: Schedule, op: SchedulerOperation, prev_op: SchedulerOperation, block_config
):
    """Estimate bandwidths, MACs and cycles for a single scheduled operation.

    A PerformanceQuery is populated from the operation's IFM, optional IFM2, OFM, kernel and the given block
    config. NPU cycles and element accesses are measured for that query, after which LUT and buffered weight DMA
    transfers are added (reduced by the slack buffering cycles recorded for prev_op, if any), followed by the
    OFM write, IFM/IFM2 read, weight read and scale/bias read bandwidths.

    Returns:
        A tuple (bws, macs, cycles_a) where bws is the unscaled bandwidth array, macs is the op_macs value
        reported by measure_cycle_cost and cycles_a is the cycles array. The memory access entries of the cycles
        array are computed from a second bandwidth array that is scaled by the estimated memory transfer
        efficiency.
    """
    pass_cycles = make_cycles_array()
    bandwidths = make_bandwidth_array()
    # Bandwidths scaled with the estimated memory transfer efficiency
    scaled_bandwidths = make_bandwidth_array()

    parent_op = op.parent_op

    query = PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = op.ifm.shape
    query.ifm_format = op.ifm.format
    query.ifm_memory_area = op.ifm.mem_area
    query.ifm_bits = op.ifm.dtype.size_in_bits()
    # The IFM2 fields take the (falsy) value of op.ifm2 itself when there is no second input
    query.ifm2_shape = op.ifm2 and op.ifm2.shape
    query.ifm2_format = op.ifm2 and op.ifm2.format
    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
    query.ofm_shape = op.ofm.shape
    query.ofm_memory_area = op.ofm.mem_area
    query.ofm_bits = op.ofm.dtype.size_in_bits()
    query.ofm_format = op.ofm.format
    query.kernel = op.kernel
    query.config = block_config

    op_cost = schedule.cost_map[op]
    if prev_op:
        prev_cost = schedule.cost_map[prev_op]
    else:
        prev_cost = None

    if parent_op.bias:
        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
        # Constants are read from the buffered weight tensor when there is one
        const_tensor = op_cost.buffered_weight_tensor or op_cost.npu_weights_tensor
        query.const_memory_area = const_tensor.mem_area

    activation = parent_op.activation
    faf_type = activation and activation.op_type
    op_cycle_cost = measure_cycle_cost(arch, op.op_type, faf_type, query)
    pass_cycles[PassCycles.Npu] = op_cycle_cost.op_cycles
    macs = op_cycle_cost.op_macs

    access = measure_element_access(arch, query)

    # How many NPU cycles are available under the previously executing
    # operator for performing buffered DMA transfers
    slack_cycles = 0
    if prev_cost:
        slack_cycles = prev_cost.slack_buffering_cycles

    # LUT Transfer
    lut_transfer_cycles = 0
    if parent_op.activation_lut:
        lut_tensors = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT]
        lut_tensor = lut_tensors[0]
        src_tensor = lut_tensor.src_tensor
        # A transfer is only accounted for when the LUT has a source tensor in a different memory area
        if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
            lut_bytes = src_tensor.storage_size()
            lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, lut_bytes)

            bandwidths[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += lut_bytes
            # LUT read from SHRAM TODO remove?
            scaled_bandwidths[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += lut_bytes

    npu_weights = op_cost.npu_weights_tensor
    buffered_weights = op_cost.buffered_weight_tensor
    if npu_weights and buffered_weights:
        # DMA Weight Transfer
        # Get the size of the first DMA: the encoded range keyed by (core, 0) for every core, each rounded up to 16
        encoded_ranges = npu_weights.encoded_ranges
        first_dma_keys = (WeightKey(core, 0) for core in range(arch.ncores))
        first_dma_bytes = sum(
            round_up(encoded_ranges[key].total_bytes, 16) for key in first_dma_keys if key in encoded_ranges
        )

        total_weight_bytes = len(npu_weights.buffer)
        bandwidths[npu_weights.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_weight_bytes
        bandwidths[buffered_weights.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_weight_bytes

        first_transfer_cycles = measure_mem2mem_cycles(
            arch, npu_weights.mem_area, buffered_weights.mem_area, first_dma_bytes
        )

        # Add cycles for Weight + Scale Transfer
        transfer_bound_cycles = (
            op_cost.full_weight_transfer_cycles - slack_cycles + op_cost.slack_buffering_cycles
        )
        compute_bound_cycles = op_cycle_cost.op_cycles + max(first_transfer_cycles - slack_cycles, 0)
        pass_cycles[PassCycles.Npu] = max(transfer_bound_cycles, compute_bound_cycles)

        # Add cycles for LUT Transfer
        pass_cycles[PassCycles.Npu] += lut_transfer_cycles
    else:
        # Add cycles for LUT Transfer
        pass_cycles[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0)

    # OFM write
    ofm = parent_op.ofm
    ofm_bytes = access.ofm_write * ofm.element_size()
    bandwidths[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += ofm_bytes
    scaled_ofm_bytes = _estimate_memory_transfer_efficiency(
        arch,
        False,
        query.ofm_memory_area,
        ofm.format,
        query.ofm_bits,
        query.config.ofm_block,
        query.ofm_shape,
        ofm_bytes,
    )
    scaled_bandwidths[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += scaled_ofm_bytes

    # IFM read
    ifm = parent_op.ifm
    ifm_bytes = access.ifm_read[0] * ifm.element_size()
    bandwidths[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += ifm_bytes
    scaled_ifm_bytes = _estimate_memory_transfer_efficiency(
        arch,
        True,
        query.ifm_memory_area,
        ifm.format,
        query.ifm_bits,
        query.config.ifm_block,
        query.ifm_shape,
        ifm_bytes,
    )
    scaled_bandwidths[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += scaled_ifm_bytes

    if query.ifm2_shape:
        ifm2 = parent_op.ifm2
        ifm2_bytes = access.ifm_read[1] * ifm2.element_size()
        bandwidths[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += ifm2_bytes
        scaled_ifm2_bytes = _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm2_memory_area,
            ifm2.format,
            op.ifm2.dtype.size_in_bits(),
            query.config.ifm_block,
            query.ifm2_shape,
            ifm2_bytes,
        )
        scaled_bandwidths[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += scaled_ifm2_bytes

    # Weight read
    if access.const_read[0] > 0:
        # alignment not accounted for in bandwidth_compression_scale_approx
        encoded_size_approx = (
            op_cost.npu_weights_tensor.elements() - access.const_read[1] * parent_op.bias.element_size()
        )
        orig_weight_size = parent_op.weights.elements()
        bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size
        weight_bytes = access.const_read[0] * bandwidth_compression_scale_approx
        bandwidths[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += weight_bytes

        # Only added to the scaled bandwidths when the weights are not buffered
        if not buffered_weights:
            scaled_bandwidths[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += weight_bytes

    if access.const_read[1] > 0:
        # Scales & biases
        scale_bias_bytes = access.const_read[1] * parent_op.bias.element_size()
        bandwidths[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += scale_bias_bytes

        # Only added to the scaled bandwidths when the weights are not buffered
        if not buffered_weights:
            scaled_bandwidths[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += scale_bias_bytes

    update_summary_cycles(arch, scaled_bandwidths, pass_cycles)

    return bandwidths, macs, pass_cycles

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

724

725

Ayaan Masood

2022-02-22 11:28:55 +0000

[diff] [blame]

726

def calc_new_performance_for_network(nng: Graph, arch):

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

727

total_bws = make_bandwidth_array()

Diqing Zhong

69aadd0

2020-12-08 13:08:48 +0100

[diff] [blame]

728

total_macs = 0

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

729

total_cycles = np.zeros(PassCycles.Size)

Ayaan Masood

2022-02-22 11:28:55 +0000

[diff] [blame]

730

total_weight_size = 0

731

total_encoded_weight_size = 0

732

733

# Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights

734

original_weight_uuids: Set[UUID] = set()

735

encoded_npu_weight_uuids: Set[UUID] = set()

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

736

737

for sg in nng.subgraphs:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

738

prev_op = None

739

for sched_op in sg.sched_ops:

Ayaan Masood

2022-02-22 11:28:55 +0000

[diff] [blame]

740

op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op]

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

741

bws, macs, cycles = estimate_full_op_performance(arch, sg.schedule, sched_op, prev_op, op_info.block_config)

Ayaan Masood

2022-02-22 11:28:55 +0000

[diff] [blame]

742

743

# Tensors for calculating weight sizes

744

original_weight = sched_op.parent_op.weights

745

encoded_npu_weight = op_info.npu_weights_tensor

746

747

# Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights

748

if original_weight and (original_weight.equivalence_id not in original_weight_uuids):

749

750

original_weight_uuids.add(original_weight.equivalence_id)

751

total_weight_size += original_weight.values.itemsize * original_weight.values.size

752

753

# Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights

754

if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids):

755

756

encoded_npu_weight_uuids.add(encoded_npu_weight)

757

total_encoded_weight_size += len(encoded_npu_weight.buffer)

758

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

759

total_bws += bws

760

total_macs += macs

761

total_cycles += cycles

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

762

prev_op = sched_op

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

763

764

nng.bandwidths = total_bws

765

nng.macs = total_macs

766

nng.cycles = total_cycles

Ayaan Masood