Blame - ethosu/vela/weight_compressor.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

1

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

17

# Compresses and pads the weights. It also calculates the scales and packs them with the biases.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

18

import math

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

19

from collections import namedtuple

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

20

21

import numpy as np

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

22

Manupa Karunaratne

2020-07-20 12:05:32 +0100

[diff] [blame]

23

from .architecture_features import Accelerator

24

from .architecture_features import ArchitectureFeatures

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

25

from .data_type import DataType

Louis Verhaard

7db7896

2020-05-25 15:05:26 +0200

[diff] [blame]

26

from .errors import UnsupportedFeatureError

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

27

from .nn_graph import SchedulingStrategy

28

from .numeric_util import round_up

Patrik Gustavsson

2020-07-08 11:27:12 +0200

[diff] [blame]

29

from .numeric_util import round_up_divide

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

30

from .operation import NpuBlockType

Louis Verhaard

2020-09-30 09:01:52 +0200

[diff] [blame^]

31

from .operation import Op

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

32

from .scaling import quantise_scale

33

from .scaling import reduced_quantise_scale

Louis Verhaard

2020-09-23 10:27:11 +0200

[diff] [blame]

34

from .tensor import create_equivalence_id

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

35

from .tensor import TensorBlockTraversal

36

from .tensor import TensorFormat

37

from .tensor import TensorPurpose

38

from .tensor import TensorSubPurpose

Jacob Bohlin

2020-06-23 12:12:56 +0200

[diff] [blame]

39

from ethosu import mlw_codec

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

40

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

41

Louis Verhaard

2020-05-07 08:12:58 +0200

[diff] [blame]

42

# Contains meta info for a weight compression. If two tensors have identical weight compression config,

43

# then they also will have identical compressed weights.

44

# Hashable record type; CompressedWeightCache uses instances of it as dictionary keys.
WeightCompressionConfig = namedtuple(
    "WeightCompressionConfig",
    (
        "npu_block_type",
        "ofm_block_depth",
        "ofm_depth_step",
        "dilation",
        "value_id",
    ),
)

Manupa Karunaratne

2020-07-20 12:05:32 +0100

[diff] [blame]

49

def encode_weights(
    accelerator: Accelerator,
    weights_volume: np.ndarray,
    dilation_xy: tuple,
    ifm_bitdepth: int,
    ofm_block_depth: int,
    is_depthwise: bool,
    is_partkernel: bool,
):
    """
    Public facing API to use the ethosu weight encoding.

    The weights are first reordered into a flat stream by generate_brick() and
    that stream is then compressed by encode().

    :param accelerator: architecture_features.Accelerator enum to pick the correct ethosu accelerator
    :param weights_volume: numpy.ndarray in OHWI layout with a shape of four
    :param dilation_xy: a two element tuple of dilation attributes; each element must be 1 or 2
    :param ifm_bitdepth: the bitdepth of input feature map
    :param ofm_block_depth: the depth of blocks for ethosu processing
    :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
    :param is_partkernel: a boolean indicating these weights are traversed on sub-kernel basis
    :return: the compressed weight stream returned by encode()
    """
    # NOTE(review): generate_brick() divides SubKernelMax.height by element 0 of the dilation
    # tuple and SubKernelMax.width by element 1 - confirm the intended element order of
    # dilation_xy against the callers.

    # Check arg types
    typed_args = (
        (accelerator, Accelerator),
        (weights_volume, np.ndarray),
        (dilation_xy, tuple),
        (ifm_bitdepth, int),
        (ofm_block_depth, int),
        (is_depthwise, bool),
        (is_partkernel, bool),
    )
    for value, expected_type in typed_args:
        assert isinstance(value, expected_type)

    # Checks for weight layout
    assert len(weights_volume.shape) == 4, "weights ndarray should have a shape of 4"

    # It cannot be both partkernel and depthwise
    assert not is_depthwise or not is_partkernel, "encode_weights :: partkernel and depthwise are mutually exclusive"

    # Check valid values for dilation
    valid_dilations = (1, 2)
    assert dilation_xy[0] in valid_dilations, "encode_weights :: dilation x should be 1 or 2 not {}".format(
        dilation_xy[0]
    )
    assert dilation_xy[1] in valid_dilations, "encode_weights :: dilation y should be 1 or 2 not {}".format(
        dilation_xy[1]
    )

    # The micro-block sizes depend on the selected accelerator configuration
    accelerator_config = ArchitectureFeatures.accelerator_configs[accelerator]
    reordered_weights = generate_brick(
        ifm_ublock=accelerator_config.ifm_ublock,
        ofm_ublock=accelerator_config.ofm_ublock,
        brick_weights=weights_volume,
        ofm_block_depth=ofm_block_depth,
        is_depthwise=is_depthwise,
        is_partkernel=is_partkernel,
        ifm_bitdepth=ifm_bitdepth,
        dilation=dilation_xy,
    )
    return encode(reordered_weights)

104

105

Manupa Karunaratne

bef228b

2020-07-29 18:06:28 +0100

[diff] [blame]

106

def encode_bias(bias: np.int64, scale: int, shift: int):
    """
    Public facing API to pack bias and scale values as required by the hardware

    The ten returned bytes are laid out least significant byte first:
    bytes 0-4 hold the bias, bytes 5-8 hold the scale and byte 9 holds the shift.

    :param bias: 64bit signed number that includes 40bit signed bias
    :param scale: 32bit scale value
    :param shift: 6bit shift value
    :return: packed 80bit [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
    """
    # Check arg types
    assert isinstance(bias, np.int64)
    assert isinstance(scale, int)
    assert isinstance(shift, int)

    bias_limit = 1 << (40 - 1)
    assert -bias_limit <= bias < bias_limit  # signed 40-bit range
    assert 0 <= scale < (1 << 32)  # unsigned 32-bit range
    assert 0 <= shift < (1 << 6)  # unsigned 6-bit range

    # Masking keeps only the low bits, which yields the two's complement form of a negative bias
    packed = bytearray()
    packed += (int(bias) & 0xFFFFFFFFFF).to_bytes(5, byteorder="little")
    packed += (scale & 0xFFFFFFFF).to_bytes(4, byteorder="little")
    packed.append(shift & 0x3F)
    return packed

Louis Verhaard

2020-06-04 15:51:24 +0200

[diff] [blame]

137

def create_weight_compression_config(tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
    # Builds the WeightCompressionConfig describing how the weights of tens are compressed.
    # Only the depth of an ofm block matters for weight compression, and a block that is deeper
    # than the ofm gives the same result as a block exactly as deep as the ofm, so the
    # depth stored in the config is clamped to the ofm depth.
    ofm_depth = tens.quant_values.shape[-1]
    clamped_block_depth = min(ofm_block_depth, ofm_depth)
    return WeightCompressionConfig(
        npu_block_type=npu_block_type,
        ofm_block_depth=clamped_block_depth,
        ofm_depth_step=ofm_depth_step,
        dilation=dilation,
        value_id=tens.value_id,
    )

Louis Verhaard

2020-05-07 08:12:58 +0200

[diff] [blame]

142

143

144

def set_storage_shape(tens):
    # Sets the storage shape depending on the tensor's sub purpose
    uses_double_buffer = tens.sub_purpose == TensorSubPurpose.DoubleBuffer and len(tens.compressed_values) > 2
    if not uses_double_buffer:
        # Total compressed length is kept as the last entry of the offsets list
        size = tens.weight_compressed_offsets[-1]
    else:
        # Twice the length of the longest compressed stream
        stream_lengths = [len(stream) for stream in tens.compressed_values]
        size = 2 * np.amax(stream_lengths)
        assert size % 16 == 0
    tens.storage_shape = [1, 1, 1, size]

152

153

154

class CompressedWeightCache:
    # Holds the weight compressions of every weight tensor in a graph

    def __init__(self):
        # Key: WeightCompressionConfig, value: tensor clone that carries the compressed weights
        self.cache = dict()

    def get_tensor_with_same_compression(self, wcc):
        # Looks up the clone stored for the given config; gives None when nothing is stored
        return self.cache.get(wcc)

    def add(self, tens):
        # Stores the compressed weights of the tensor in the cache.
        # A clone is stored instead of the tensor itself so that nothing related to the
        # weight compression gets modified afterwards.
        config = tens.weight_compression_config
        clone_suffix = "_weights{}_{}".format(config.ofm_block_depth, config.ofm_depth_step)
        self.cache[config] = tens.clone(clone_suffix)

168

169

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

170

def encode(weight_stream):
    # Compresses a flat weight stream and pads the result to a multiple of 16 entries.
    # An empty stream is returned as an empty list without calling the codec.
    if len(weight_stream) == 0:
        return []

    # Every weight has to be within [-255, 255]
    assert -255 <= np.amin(weight_stream)
    assert 255 >= np.amax(weight_stream)

    # Encode flattened signed weight stream
    compressed = mlw_codec.encode(weight_stream)

    # pad with 0xFF as needed so the length of the weight stream
    # is a multiple of 16
    missing = -len(compressed) % 16
    for _ in range(missing):
        compressed.append(0xFF)

    return compressed

Manupa Karunaratne

2020-07-20 12:05:32 +0100

[diff] [blame]

188

def generate_brick(
    ifm_ublock, ofm_ublock, brick_weights, ofm_block_depth, is_depthwise, is_partkernel, ifm_bitdepth, dilation
):
    # Flattens the OHWI weights in brick_weights into a list, in the traversal order implemented
    # by the nested loops below. Positions that fall outside the weight volume, or that only
    # exist because of element padding, are emitted as 0.

    # Largest sub-kernel that is handled in one go, reduced by the dilation
    decomp_h = ArchitectureFeatures.SubKernelMax.height // dilation[0]
    decomp_w = ArchitectureFeatures.SubKernelMax.width // dilation[1]

    # Expect weights formatted OHWI
    ofm_depth = brick_weights.shape[-4]
    kernel_height = brick_weights.shape[-3]
    kernel_width = brick_weights.shape[-2]
    ifm_depth = brick_weights.shape[-1]

    # IFM block depth
    if is_partkernel or (ifm_bitdepth == 16):
        # IFM block depth is always 16 for part-kernel-first
        ifm_block_depth = 16
    elif ifm_bitdepth == 8:
        ifm_block_depth = 32
    else:
        assert False

    # Number of kernel elements per sub-kernel is padded up to a multiple of this value.
    # Part kernel first works across the kernel H/W and needs padding.
    # Depthwise Conv requires multiple of 4 kernel elements in its weight block,
    # this is different from normal conv which is considered "weights depth-first"
    if is_partkernel:
        if ifm_bitdepth == 16:
            element_multiple = 2
        elif ifm_bitdepth == 8:
            element_multiple = 4
        else:
            element_multiple = 1
    elif is_depthwise:
        element_multiple = 4
    else:
        element_multiple = 1

    # Depthwise only visits a single IFM block and a single element per IFM ublock
    ifm_extent = 1 if is_depthwise else ifm_depth
    ifm_elements_per_ublock = 1 if is_depthwise else ifm_ublock.depth

    stream = []

    # Top level striping - OFM blocks in the entire brick's depth
    for ofm_block_z in range(0, ofm_depth, ofm_block_depth):
        clipped_ofm_block_depth = min(ofm_block_depth, ofm_depth - ofm_block_z)
        # IFM blocks required for the brick
        for ifm_block_z in range(0, ifm_extent, ifm_block_depth):
            if is_depthwise:
                clipped_ifm_block_depth = ifm_ublock.depth
            elif is_partkernel:
                clipped_ifm_block_depth = min(ifm_block_depth, ifm_depth - ifm_block_z)
            else:
                clipped_ifm_block_depth = ifm_block_depth

            # Part-kernel-first walks the IFM ublocks outside the kernel elements,
            # depth-first walks them inside; the loop that is not used gets a depth of 1
            if is_partkernel:
                outer_depth, inner_depth = clipped_ifm_block_depth, 1
            else:
                outer_depth, inner_depth = 1, clipped_ifm_block_depth

            # Weight decomposition
            # Subkernel Splitting (H)
            for subkernel_y in range(0, kernel_height, decomp_h):
                sub_height = min(kernel_height - subkernel_y, decomp_h)
                # Subkernel splitting (W)
                for subkernel_x in range(0, kernel_width, decomp_w):
                    sub_width = min(kernel_width - subkernel_x, decomp_w)
                    subkernel_elements = sub_width * sub_height
                    if element_multiple > 1:
                        subkernel_elements = int(
                            math.ceil(subkernel_elements / element_multiple) * element_multiple
                        )

                    # IFM Ublocks in IFM-block over depth for part-kernel-first mode
                    # For depth-first IFM Ublocks are traversed after subkernel elements so this loop is ignored.
                    for ifm_ublk_outer in range(0, outer_depth, ifm_ublock.depth):
                        # OFM Ublocks in OFM-block over depth
                        for ofm_ublk in range(0, clipped_ofm_block_depth, ofm_ublock.depth):
                            # HW Kernel element traversal - cannot be a H/W loop due to element
                            # padding requirement on depthwise/part-kernel configurations
                            for element in range(subkernel_elements):
                                ky, kx = divmod(element, sub_width)
                                # Source position within the current subkernel
                                wx = subkernel_x + kx
                                wy = subkernel_y + ky
                                # Elements added by the padding lie below the last kernel row
                                is_padding_element = ky >= sub_height
                                # IFM Ublocks in IFM-block over depth (only 1 ublock if depthwise)
                                # In case of part-kernel-first IFM Ublock traversal have already been handled
                                # and this loop is ignored.
                                for ifm_ublk_inner in range(0, inner_depth, ifm_ublock.depth):
                                    ifm_ublk = ifm_ublk_inner + ifm_ublk_outer
                                    # Feed OFM ublock elements
                                    for ofm_ublock_z in range(ofm_ublock.depth):
                                        ofm_z = ofm_block_z + ofm_ublk + ofm_ublock_z
                                        # Source IFM ublock elements (only 1 element deep if depthwise)
                                        for ifm_ublock_z in range(ifm_elements_per_ublock):
                                            ifm_z = ifm_block_z + ifm_ublk + ifm_ublock_z
                                            if (ifm_z >= ifm_depth) or (ofm_z >= ofm_depth) or is_padding_element:
                                                stream.append(0)
                                            else:
                                                stream.append(brick_weights[ofm_z][wy][wx][ifm_z])
    return stream

273

Jacob Bohlin

2020-06-23 12:12:56 +0200

[diff] [blame]

274

Tim Hall

2020-06-25 15:04:31 +0100

[diff] [blame]

275

def core_deinterleave(hwio, core, ncores):
    # Picks the part of a weight volume that is handled by one core.
    # The last axis of hwio is moved to the front (which puts the weights back into OHWI),
    # then every ncores-th entry along that axis is kept, starting at index core.
    o_first_axes = (3, 0, 1, 2)
    ohwi = np.transpose(hwio, axes=o_first_axes)
    ofm_depth = ohwi.shape[0]
    return ohwi[slice(core, ofm_depth, ncores)]

279

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

280

281

# Compress the weights

Louis Verhaard

2020-06-04 15:51:24 +0200

[diff] [blame]

282

def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
    """
    Compress the weights of tens and store the result on the tensor.

    The compression is looked up in nng.weight_cache first (the cache is created when
    it is still None). On a cache hit the compressed weight info is copied from the
    cached tensor. Otherwise the weights are zero-point corrected, cut into bricks of
    ofm_depth_step output channels, de-interleaved per core, encoded with
    encode_weights() and the tensor is added to the cache.

    :param arch: provides ncores and accelerator_config
    :param nng: graph object that owns weight_cache
    :param tens: weight tensor; its purpose must be TensorPurpose.Weights
    :param npu_block_type: NpuBlockType used to select the block traversal
    :param ofm_block_depth: ofm block depth, shared out between the cores
    :param ofm_depth_step: number of output channels per brick
    :param dilation: two element dilation tuple that is forwarded to encode_weights()
    """
    assert tens.purpose == TensorPurpose.Weights

    # Check the weight cache
    if nng.weight_cache is None:
        nng.weight_cache = CompressedWeightCache()
    wcc = create_weight_compression_config(tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation)
    tens.weight_compression_config = wcc
    # Reassign equivalence id such that tensors with same weight compression get identical equivalence ids,
    # but tensors with the same values but different compression get different equivalence ids
    tens.equivalence_id = create_equivalence_id(wcc)
    tens_cached = nng.weight_cache.get_tensor_with_same_compression(wcc)
    if tens_cached is not None:
        # Cache hit, copy weights from the cache
        tens.copy_compressed_weight_info(tens_cached)
        set_storage_shape(tens)
        return
    # No cache hit, perform the compression
    assert tens.quantization is not None
    assert tens.quantization.scale_f32 is not None
    assert tens.quantization.zero_point is not None

    zero_point = tens.quantization.zero_point
    quant_buf = tens.quant_values.astype(np.int64)

    # Early zero-point correction
    weights = quant_buf - zero_point

    if len(weights.shape) == 2:
        # Two leading axes of size one are added so that the weights are four dimensional
        weights = np.expand_dims(np.expand_dims(weights, axis=0), axis=0)

    compression_scales = []
    compressed_offsets = []
    encoded_streams = []
    encoded_streams_substream_offsets = []
    offset = 0

    ifm_bitdepth = tens.consumer_list[0].inputs[0].dtype.size_in_bits()
    ifm_depth = weights.shape[-2]
    if npu_block_type == NpuBlockType.ConvolutionDepthWise:
        tens.block_traversal = TensorBlockTraversal.DepthWise
    if npu_block_type == NpuBlockType.ConvolutionMxN:
        # Determine which block traversal strategy has better DPU utilization
        kernel_size = weights.shape[0] * weights.shape[1]
        depth_utilization = weights.shape[2] / round_up(weights.shape[2], 32 if ifm_bitdepth == 8 else 16)
        part_kernel_utilization = (weights.shape[2] / round_up(weights.shape[2], 8)) * (
            kernel_size / round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2)
        )
        if part_kernel_utilization >= depth_utilization or ifm_depth <= 8:
            # Part-kernel first is always better for ifm depths <= 8
            tens.block_traversal = TensorBlockTraversal.PartKernelFirst
        else:
            tens.block_traversal = TensorBlockTraversal.DepthFirst

    is_depthwise = tens.block_traversal == TensorBlockTraversal.DepthWise
    is_partkernel = tens.block_traversal == TensorBlockTraversal.PartKernelFirst

    if tens.consumer_list[0].type == Op.Conv2DBackpropInputSwitchedBias:
        # Transpose Convolution, reverse weights in H and W axes
        weights = np.flip(weights, axis=(0, 1))

    # Calculate brick size
    brick_size = (weights.shape[0], weights.shape[1], weights.shape[2], min(tens.shape[-1], ofm_depth_step))
    elements_in_brick = np.prod(brick_size)

    # Slice weight stream up depth-ways into bricks and compress
    full_ofm_depth = quant_buf.shape[-1]
    for idx in range(0, full_ofm_depth, ofm_depth_step):
        # Get the weights necessary for this brick
        count = min(full_ofm_depth - idx, ofm_depth_step)
        brick_weights = weights[:, :, :, idx : idx + count]

        substream_offsets = [0]
        encoded_stream = []

        # For each core, deinterleave weights from the larger volume
        # and generate separate compressed streams.
        for core in range(0, min(arch.ncores, full_ofm_depth)):
            core_weights = core_deinterleave(brick_weights, core, arch.ncores)

            # Share of the ofm block depth that this core gets
            block_depth = (ofm_block_depth + arch.ncores - 1 - core) // arch.ncores
            encoded_substream = []
            if block_depth != 0:
                encoded_substream = encode_weights(
                    accelerator=arch.accelerator_config,
                    weights_volume=core_weights,
                    dilation_xy=dilation,
                    ifm_bitdepth=ifm_bitdepth,
                    ofm_block_depth=block_depth,
                    is_depthwise=is_depthwise,
                    is_partkernel=is_partkernel,
                )
            encoded_stream.extend(encoded_substream)
            substream_offsets.append(len(encoded_stream))

        encoded_streams.append(encoded_stream)
        encoded_streams_substream_offsets.append(substream_offsets)

        # Remember where we put it for linear addressing
        compressed_offsets.append(offset)
        offset += len(encoded_stream)
        assert offset % 16 == 0

        # Compression scale tracking
        compression_scales.append(len(encoded_stream) / elements_in_brick)

    # Track total length as last element of the offsets array
    compressed_offsets.append(offset)

    tens.weight_compression_scales = compression_scales
    tens.weight_compressed_offsets = compressed_offsets
    tens.compression_scale_for_worst_weight_stream = np.amax(compression_scales)
    tens.storage_compression_scale = tens.bandwidth_compression_scale = np.average(compression_scales)
    tens.compressed_values = encoded_streams
    tens.compressed_values_substream_offsets = encoded_streams_substream_offsets
    tens.brick_size = brick_size
    set_storage_shape(tens)
    nng.weight_cache.add(tens)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

404

Jacob Bohlin

2020-06-23 12:12:56 +0200

[diff] [blame]

405

Tim Hall

2020-06-25 15:04:31 +0100

[diff] [blame]

406

def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=False):
    """
    Quantise the scales that belong to the bias tensor tens and pack them with the biases.

    One 10 byte element is produced per bias with encode_bias(). The elements are grouped
    into bricks of ofm_depth_step biases; inside a brick the elements are de-interleaved
    over arch.ncores cores and each per-core substream is zero padded to a multiple of 16
    bytes. The results are written to tens.compressed_values,
    tens.compressed_values_substream_offsets, tens.element_size_bytes and tens.storage_shape.

    :param tens: bias tensor (purpose FeatureMap, format NHWC) of an operator that needs a bias
    :param arch: provides ncores
    :param ofm_depth_step: number of biases per brick
    :param rescale_for_faf: when True the scales are computed as ifm_scale * weight_scale * 0x3000
        instead of ifm_scale * weight_scale / ofm_scale
    :raises UnsupportedFeatureError: when the ifm data type is not uint8, int8 or int16
    """
    assert tens.purpose == TensorPurpose.FeatureMap
    assert tens.format == TensorFormat.NHWC
    # the connected operator should expect a bias input unless it is a FullyConnected
    assert tens.consumer_list[0].type.needs_bias()
    # the input bias tensor is the same as that connected to the operator
    bias_tens = tens.consumer_list[0].bias
    assert tens is bias_tens

    # the operator should only have a single output
    assert len(tens.consumer_list[0].outputs) == 1
    biases = tens.quant_values

    first_consumer_op = tens.consumer_list[0]
    ifm_dtype = first_consumer_op.inputs[0].dtype
    ifm_scale = first_consumer_op.inputs[0].quantization.scale_f32
    ofm_scale = first_consumer_op.get_output_quantization().scale_f32
    weight_scales = first_consumer_op.inputs[1].quantization.scale_f32

    # biases can have multiple consumers for rnn cells. if so, then check that they are all the same
    for op in tens.consumer_list[1:]:
        assert ifm_scale == op.inputs[0].quantization.scale_f32
        assert ofm_scale == op.get_output_quantization().scale_f32
        assert weight_scales == op.inputs[1].quantization.scale_f32

    if not hasattr(weight_scales, "__iter__"):
        # If weight_scales is not already an iterable make it into a list
        weight_scales = [weight_scales]

    # Convert scales to np.double (from np.float32) to conform to TensorFlow Lite which
    # uses double during scaling calculations
    # TensorFlow Lite casts the scales slightly differently for uint8 and int8
    if not rescale_for_faf:
        if ifm_dtype == DataType.uint8:
            scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [
                (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale)
                for weight_scale in weight_scales
            ]
        else:
            raise UnsupportedFeatureError(
                "Compression of {} is not implemented; tensor: {}".format(ifm_dtype, tens.name)
            )
    else:
        if ifm_dtype == DataType.uint8:
            scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales]
        else:
            raise UnsupportedFeatureError(
                "Compression of {} is not implemented; tensor: {}".format(ifm_dtype, tens.name)
            )

    # quantise all of the weight scales into (scale_factor, shift)
    if ifm_dtype == DataType.int16:
        quantised_scales = [reduced_quantise_scale(scale) for scale in scales]
    else:
        quantised_scales = [quantise_scale(scale) for scale in scales]

    # pack the biases and scales
    if len(quantised_scales) == 1:
        # If only 1 quantised scale is used, repeat that value for the length of the biases
        quantised_scales = [quantised_scales[0]] * len(biases)

    assert len(quantised_scales) == len(biases)
    tens.element_size_bytes = 10
    tens.compressed_values = []
    tens.compressed_values_substream_offsets = []

    total_elements = len(quantised_scales)
    alignment_bytes = 0
    for i in range(0, total_elements, ofm_depth_step):
        # Extract streams from brick to generate substreams for each core
        stream = bytearray()
        substream_offsets = [0]
        max_len = min(ofm_depth_step, total_elements - i)
        # The brick covers the elements [i, brick_end). Every per-core slice stops at the end
        # of the brick so that a core never picks up an element that belongs to the next brick
        # when the brick length is not a multiple of the number of cores.
        brick_end = i + max_len
        for core in range(0, min(arch.ncores, max_len)):
            core_scales = quantised_scales[i + core : brick_end : arch.ncores]
            core_biases = biases[i + core : brick_end : arch.ncores]
            for core_bias, core_scale in zip(core_biases, core_scales):
                stream.extend(encode_bias(np.int64(core_bias), *core_scale))

            # Align to 16 for start for next substream
            remainder = (len(stream)) % 16
            if remainder > 0:
                stream.extend(bytearray(16 - remainder))
                alignment_bytes += 16 - remainder

            substream_offsets.append(len(stream))

        # Add to compressed values with their substream offset lists to the tensor
        tens.compressed_values.append(stream)
        tens.compressed_values_substream_offsets.append(substream_offsets)

    # The padding bytes are accounted for as additional (rounded up) elements
    tens.storage_shape = [total_elements + round_up_divide(alignment_bytes, tens.element_size_bytes)]

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

502

Jacob Bohlin

2020-06-23 12:12:56 +0200

[diff] [blame]

503

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

504

def update_pass_weight_and_scale_tensors(nng, arch):

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

505

for sg in nng.subgraphs:

506

for ps in sg.passes:

Louis Verhaard

2020-05-07 08:12:58 +0200

[diff] [blame]

507

tens = ps.weight_tensor

508

if tens is not None:

Louis Verhaard

2020-06-04 15:51:24 +0200

[diff] [blame]

509

op = tens.find_npu_op()

Dwight Lidman

940fdee

2020-08-13 13:11:48 +0200

[diff] [blame]

510

if op is None:

511

continue

Louis Verhaard

2020-05-07 08:12:58 +0200

[diff] [blame]

512

needs_dma = tens.needs_dma()

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

513

if ps.cascade.strategy == SchedulingStrategy.WeightStream and needs_dma:

514

ofm_depth_step = ps.block_config[-1]

515

else:

Louis Verhaard

2020-05-07 08:12:58 +0200

[diff] [blame]

516

ofm_depth_step = tens.shape[-1]

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

517

compress_weights(

Louis Verhaard

2020-09-30 09:01:52 +0200

[diff] [blame^]

518

arch, nng, tens, op.type.npu_block_type, ps.block_config[-1], ofm_depth_step, op.get_dilation_h_w()

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

519

)

520

# Update source tensor

Louis Verhaard

2020-05-07 08:12:58 +0200

[diff] [blame]

521

if needs_dma:

522

src_tens = tens.get_dma_src_tensor()

523

src_tens.shape = tens.shape

524

src_tens.quant_values = tens.quant_values

525

src_tens.copy_compressed_weight_info(tens)

526

set_storage_shape(src_tens)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

527

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

528

if ps.scale_tensor is not None:

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

529

rescale_for_faf = False

Louis Verhaard

2020-09-30 09:01:52 +0200

[diff] [blame^]

530

activation_ops = set((Op.Sigmoid, Op.Tanh))

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

531

if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise):

532

rescale_for_faf = True

Tim Hall