# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Compresses and pads the weights. It also calculates the scales and packs them with the biases.

import os
import sys
import enum
import math
import numpy as np
from collections import namedtuple
from .numeric_util import round_up
from .scaling import quantise_scale, reduced_quantise_scale
from .tensor import TensorPurpose, TensorSubPurpose, TensorFormat, TensorBlockTraversal
from .operation import NpuBlockType
from .architecture_features import Block
from .nn_graph import SchedulingStrategy
from .data_type import DataType

from ethosu import mlw_codec


def encode(weight_stream):
    assert np.amin(weight_stream) >= -255
    assert np.amax(weight_stream) <= 255

    # Encode flattened signed weight stream
    compressed = mlw_codec.encode(weight_stream)

    # pad with 0xFF as needed so the length of the weight stream
    # is a multiple of 16

    while (len(compressed) % 16) != 0:
        compressed.append(0xFF)

    return compressed
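
# Worked example of the padding rule above: if the codec emits 21 bytes for a
# stream, eleven 0xFF bytes are appended so that len(compressed) == 32, which
# keeps every encoded stream 16-byte aligned for the offsets built up in
# compress_weights() below.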


def generate_brick(arch, brick_weights, ofm_block, block_traversal, ifm_bitdepth):
    is_depthwise = block_traversal == TensorBlockTraversal.DepthWise
    is_partkernel = block_traversal == TensorBlockTraversal.PartKernelFirst
    subkernel_max = arch.subkernel_max
    ofm_ublock = arch.ofm_ublock
    ifm_ublock = arch.ifm_ublock
    # Expect weights formatted HWIO
    ofm_depth = brick_weights.shape[-1]
    ifm_depth = brick_weights.shape[-2]
    kernel_width = brick_weights.shape[-3]
    kernel_height = brick_weights.shape[-4]
    # IFM block depth
    if is_partkernel or (ifm_bitdepth == 16):
        # IFM block depth is always 16 for part-kernel-first
        ifm_block_depth = 16
    elif ifm_bitdepth == 8:
        ifm_block_depth = 32
    else:
        assert False
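    # Either way an IFM block covers 256 bits of input depth (32 x 8-bit or
    # 16 x 16-bit), presumably matching a fixed per-block budget in the hardware.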

    stream = []

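    # Loop nest summary - the stream is emitted in this order:
    #   OFM block (over brick depth) -> IFM block -> subkernel Y -> subkernel X
    #   -> outer IFM ublock (part-kernel-first only) -> OFM ublock -> kernel element
    #   -> inner IFM ublock (depth-first only) -> OFM ublock element -> IFM ublock element
    # Out-of-range source positions append zero so every block has a fixed size.
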
    # Top level striping - OFM blocks in the entire brick's depth
    for ofm_block_z in range(0, ofm_depth, ofm_block.depth):
        clipped_ofm_block_depth = min(ofm_block.depth, ofm_depth - ofm_block_z)
        # IFM blocks required for the brick
        for ifm_block_z in range(0, (1 if is_depthwise else ifm_depth), ifm_block_depth):
            if is_depthwise:
                clipped_ifm_block_depth = ifm_ublock.depth
            else:
                clipped_ifm_block_depth = (
                    min(ifm_block_depth, ifm_depth - ifm_block_z) if is_partkernel else ifm_block_depth
                )
            # Weight decomposition
            # Subkernel splitting (H)
            for subkernel_y in range(0, kernel_height, subkernel_max.height):
                sub_height = min(kernel_height - subkernel_y, subkernel_max.height)
                # Subkernel splitting (W)
                for subkernel_x in range(0, kernel_width, subkernel_max.width):
                    sub_width = min(kernel_width - subkernel_x, subkernel_max.width)
                    subkernel_elements = sub_width * sub_height
                    # Part kernel first works across the kernel H/W and needs padding
                    if is_partkernel:
                        if ifm_bitdepth == 16 and subkernel_elements % 2 != 0:
                            subkernel_elements = int(math.ceil(subkernel_elements / 2) * 2)
                        elif ifm_bitdepth == 8 and subkernel_elements % 4 != 0:
                            subkernel_elements = int(math.ceil(subkernel_elements / 4) * 4)

                    # Depthwise Conv requires a multiple of 4 kernel elements in its weight block;
                    # this is different from normal conv, which is considered "weights depth-first"
                    elif is_depthwise:
                        subkernel_elements = int(math.ceil(subkernel_elements / 4.0) * 4)
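                    # For instance, a 3x3 subkernel (9 elements) is padded to 10
                    # elements for 16-bit part-kernel-first, and to 12 elements for
                    # 8-bit part-kernel-first or for depthwise traversal.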

                    ifm_block_depth_outer = clipped_ifm_block_depth if is_partkernel else 1
                    ifm_block_depth_inner = 1 if is_partkernel else clipped_ifm_block_depth
                    # IFM Ublocks in IFM-block over depth for part-kernel-first mode
                    # For depth-first, IFM Ublocks are traversed after the subkernel elements, so this loop is ignored.
                    for ifm_ublk_outer in range(0, ifm_block_depth_outer, ifm_ublock.depth):
                        # OFM Ublocks in OFM-block over depth
                        for ofm_ublk in range(0, clipped_ofm_block_depth, ofm_ublock.depth):
                            # HW Kernel element traversal - cannot be a H/W loop due to element
                            # padding requirement on depthwise/part-kernel configurations
                            for element in range(subkernel_elements):
                                kx = element % sub_width
                                ky = element // sub_width
                                # IFM Ublocks in IFM-block over depth (only 1 ublock if depthwise)
                                # In the part-kernel-first case the IFM Ublock traversal has already been handled
                                # and this loop is ignored.
                                for ifm_ublk_inner in range(0, ifm_block_depth_inner, ifm_ublock.depth):
                                    # Feed OFM ublock elements
                                    for ofm_ublock_z in range(ofm_ublock.depth):
                                        # Source IFM ublock elements (only 1 element deep if depthwise)
                                        for ifm_ublock_z in range(1 if is_depthwise else ifm_ublock.depth):
                                            # Source position within the current subkernel
                                            wx = subkernel_x + kx
                                            wy = subkernel_y + ky
                                            # Source IFM/OFM slices
                                            ifm_ublk = ifm_ublk_inner + ifm_ublk_outer
                                            ifm_z = ifm_block_z + ifm_ublk + ifm_ublock_z
                                            ofm_z = ofm_block_z + ofm_ublk + ofm_ublock_z
                                            if (ifm_z >= ifm_depth) or (ofm_z >= ofm_depth) or (ky >= sub_height):
                                                stream.append(0)
                                            else:
                                                stream.append(brick_weights[wy][wx][ifm_z][ofm_z])
    return stream


# Compress the weights
def compress_weights(tens, arch, npu_block_type, ofm_block, ofm_depth_step, min_val=None, max_val=None):
    assert tens.purpose == TensorPurpose.Weights
    assert tens.format == TensorFormat.WeightsCompressed

    WeightCompressionConfig = namedtuple("WeightCompressionConfig", ["npu_block_type", "ofm_block", "ofm_depth_step"])

    # check if weights have already been compressed
    wcc = tens.weight_compression_config
    if wcc is not None:
        assert wcc.npu_block_type == npu_block_type, "Weights not used by the same operator type"

        if wcc.ofm_block == ofm_block and wcc.ofm_depth_step == ofm_depth_step:
            return

    assert tens.quantization is not None
    assert tens.quantization.scale_f32 is not None
    assert tens.quantization.zero_point is not None

    zero_point = tens.quantization.zero_point
    quant_buf = tens.quant_values.astype(np.int64)

    # Early zero-point correction
    weights = quant_buf - zero_point

    if len(weights.shape) == 2:
        weights = np.expand_dims(np.expand_dims(weights, axis=0), axis=0)
        weights_shape = (weights.shape[0], 1, 1, weights.shape[1])
    else:
        weights_shape = weights.shape

    compression_scales = []
    compressed_offsets = []
    encoded_streams = []
    offset = 0
    max_single_buffer_len = 0

    ifm_bitdepth = tens.consumer_list[0].inputs[0].dtype.size_in_bits()
    ifm_depth = weights.shape[-2]
    if npu_block_type == NpuBlockType.ConvolutionDepthWise:
        tens.block_traversal = TensorBlockTraversal.DepthWise
    if npu_block_type == NpuBlockType.ConvolutionMxN:
        # Determine which block traversal strategy has better DPU utilization
        kernel_size = weights_shape[0] * weights_shape[1]
        depth_utilization = weights_shape[2] / round_up(weights_shape[2], 32 if ifm_bitdepth == 8 else 16)
        part_kernel_utilization = (weights_shape[2] / round_up(weights_shape[2], 8)) * (
            kernel_size / round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2)
        )
        if part_kernel_utilization >= depth_utilization or ifm_depth <= 8:
            # Part-kernel first is always better for ifm depths <= 8
            tens.block_traversal = TensorBlockTraversal.PartKernelFirst
        else:
            tens.block_traversal = TensorBlockTraversal.DepthFirst
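        # Worked example of the heuristic above: a 3x3 kernel with ifm depth 16
        # and an 8-bit ifm gives depth_utilization = 16/32 = 0.5 and
        # part_kernel_utilization = (16/16) * (9/12) = 0.75, so part-kernel-first
        # is selected.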

    # Slice weight stream up depth-ways into bricks and compress
    full_ofm_depth = quant_buf.shape[-1]
    for idx in range(0, full_ofm_depth, ofm_depth_step):
        # Get the weights necessary for this brick
        count = min(full_ofm_depth - idx, ofm_depth_step)
        brick_weights = weights[:, :, :, idx : idx + count]

        # Encode all weights into one chunk
        raw_stream = generate_brick(arch, brick_weights, ofm_block, tens.block_traversal, ifm_bitdepth)
        encoded = encode(raw_stream)
        encoded_streams.append(encoded)

        # Remember maximum encoded length for DoubleBuffering
        if max_single_buffer_len < len(encoded):
            max_single_buffer_len = len(encoded)

        # Remember where we put it for linear addressing
        compressed_offsets.append(offset)
        offset += len(encoded)
        assert offset % 16 == 0

        # Compression scale tracking
        compression_scales.append(len(encoded) / len(raw_stream))

    # Also track complete length in the offsets array
    compressed_offsets.append(offset)
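    # compressed_offsets now holds one more entry than encoded_streams, so stream i
    # occupies bytes [compressed_offsets[i], compressed_offsets[i + 1]) of the
    # concatenated buffer.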

    if tens.sub_purpose == TensorSubPurpose.DoubleBuffer and len(encoded_streams) > 2:
        offset = 2 * max_single_buffer_len
        assert offset % 16 == 0

    tens.storage_shape = [1, 1, 1, offset]
    tens.weight_compression_scales = compression_scales
    tens.weight_compression_config = WeightCompressionConfig(npu_block_type, ofm_block, ofm_depth_step)
    tens.weight_compressed_offsets = compressed_offsets
    tens.compression_scale_for_worst_weight_stream = np.amax(compression_scales)
    tens.storage_compression_scale = tens.bandwidth_compression_scale = np.average(compression_scales)
    tens.compressed_values = encoded_streams
    tens.brick_size = (weights_shape[0], weights_shape[1], weights_shape[2], min(tens.shape[-1], ofm_depth_step))


def calc_scales_and_pack_biases(tens, arch, oc_quantum, rescale_for_faf=False):
    assert tens.purpose == TensorPurpose.FeatureMap
    assert tens.format == TensorFormat.NHWC
    # the connected operator should expect a bias input unless it is a FullyConnected
    assert "Bias" in tens.consumer_list[0].type or tens.consumer_list[0].type.startswith("FullyConnected")
    # the input bias tensor is the same as that connected to the operator
    assert tens is tens.consumer_list[0].inputs[2]
    # the operator should only have a single output
    assert len(tens.consumer_list[0].outputs) == 1

    def pack_bias_and_scale(bias, scale, shift):
        bias = np.int64(bias)
        assert -(1 << (40 - 1)) <= bias < (1 << (40 - 1))  # signed 40-bit range
        assert 0 <= scale < (1 << 32)  # unsigned 32-bit range
        assert 0 <= shift < (1 << 6)  # unsigned 6-bit range

        # pack the 80 bit value = [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
        data = bytearray(10)
        data[0] = (bias >> (0 * 8)) & 0xFF
        data[1] = (bias >> (1 * 8)) & 0xFF
        data[2] = (bias >> (2 * 8)) & 0xFF
        data[3] = (bias >> (3 * 8)) & 0xFF
        data[4] = (bias >> (4 * 8)) & 0xFF
        data[5] = (scale >> (0 * 8)) & 0xFF
        data[6] = (scale >> (1 * 8)) & 0xFF
        data[7] = (scale >> (2 * 8)) & 0xFF
        data[8] = (scale >> (3 * 8)) & 0xFF
        data[9] = shift & 0x3F
        return data
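
    # For instance, pack_bias_and_scale(bias=1, scale=1, shift=16) produces the
    # little-endian bytes 01 00 00 00 00 | 01 00 00 00 | 10
    # (40-bit bias | 32-bit scale | shift).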

    biases = tens.quant_values

    first_consumer_op = tens.consumer_list[0]
    ifm_dtype = first_consumer_op.inputs[0].dtype
    ifm_scale = first_consumer_op.inputs[0].quantization.scale_f32
    ofm_scale = first_consumer_op.outputs[0].quantization.scale_f32
    weight_scales = first_consumer_op.inputs[1].quantization.scale_f32

    # biases can have multiple consumers for rnn cells. if so, then check that they are all the same
    for op in tens.consumer_list[1:]:
        assert ifm_scale == op.inputs[0].quantization.scale_f32
        assert ofm_scale == op.outputs[0].quantization.scale_f32
        assert weight_scales == op.inputs[1].quantization.scale_f32

    if not hasattr(weight_scales, "__iter__"):
        # If weight_scales is not already an iterable make it into a list
        weight_scales = [weight_scales]

    # Convert scales to np.double (from np.float32) to conform to TensorFlow Lite which
    # uses double during scaling calculations
    # TensorFlow Lite casts the scales slightly differently for uint8 and int8
    if not rescale_for_faf:
        if ifm_dtype == DataType.uint8:
            scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [
                (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale)
                for weight_scale in weight_scales
            ]
        else:
            assert False, str(ifm_dtype) + " not implemented"
    else:
        if ifm_dtype == DataType.uint8:
            scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales]
        else:
            assert False, str(ifm_dtype) + " not implemented"
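    # In the normal case this is the standard TensorFlow Lite effective output
    # scale, ifm_scale * weight_scale / ofm_scale. When rescaling for a fused
    # activation function (Sigmoid/Tanh), ofm_scale is dropped and a fixed 0x3000
    # factor is applied instead, i.e. an implicit output scale of 1/0x3000.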

    # quantise all of the weight scales into (scale_factor, shift)
    if ifm_dtype == DataType.int16:
        quantised_scales = [reduced_quantise_scale(scale) for scale in scales]
    else:
        quantised_scales = [quantise_scale(scale) for scale in scales]
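    # quantise_scale() and reduced_quantise_scale() are expected to return a
    # (scale_factor, shift) pair such that scale ~= scale_factor * 2**-shift,
    # with the reduced variant using a narrower scale_factor for the int16 path.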

    for _, shift in quantised_scales:
        assert shift >= 16

    # pack the biases and scales
    tens.compressed_values = []
    if len(quantised_scales) == 1:
        # If only 1 quantised scale is used, repeat that value for the length of the biases
        quantised_scales = [quantised_scales[0]] * len(biases)

    assert len(quantised_scales) == len(biases)
    for i, bias in enumerate(biases):
        tens.compressed_values.append(pack_bias_and_scale(bias, *quantised_scales[i]))

    tens.element_size_bytes = 10

    # Figure out if we need padded storage (extra whole elements)
    padding = (len(tens.compressed_values) * tens.element_size_bytes) % 16
    if padding != 0:
        padding = 16 - padding

    # This adds enough padding to allow over-reads
    while padding > 0:
        tens.compressed_values.append(pack_bias_and_scale(0, 0, 0))
        padding = padding - tens.element_size_bytes
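    # E.g. three packed biases occupy 30 bytes, giving padding == 2; one whole
    # zeroed 10-byte element is then appended, since storage is tracked in whole
    # elements rather than raw bytes.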

    tens.storage_shape = [len(tens.compressed_values)]


def update_pass_weight_and_scale_tensors(nng, arch):
    def find_npu_usage_of_tensor(tens):
        # TODO: This function is identical to the one in mark_tensors.py. A common version should be used.
        for op in tens.consumers():
            if op.type == "DMA":
                return find_npu_usage_of_tensor(op.outputs[0])
            if "npu_block_type" in op.attrs:
                return op.attrs["npu_block_type"]
            return NpuBlockType.Default

    for sg in nng.subgraphs:
        for ps in sg.passes:
            if ps.weight_tensor is not None:
                npu_usage_of_tensor = find_npu_usage_of_tensor(ps.weight_tensor)
                if npu_usage_of_tensor == NpuBlockType.ConvolutionDepthWise:
                    ps.weight_tensor.quant_values = np.transpose(ps.weight_tensor.quant_values, (0, 1, 3, 2))
                    ps.weight_tensor.shape = ps.weight_tensor.storage_shape = ps.weight_tensor.bandwidth_shape = list(
                        ps.weight_tensor.quant_values.shape
                    )
                    ps.weight_tensor.weight_transpose_depthwise = True

                needs_dma = len(ps.weight_tensor.ops) == 1 and ps.weight_tensor.ops[0].type == "DMA"
                if ps.cascade.strategy == SchedulingStrategy.WeightStream and needs_dma:
                    ofm_depth_step = ps.block_config[-1]
                else:
                    ofm_depth_step = ps.weight_tensor.shape[-1]

                compress_weights(
                    ps.weight_tensor,
                    arch,
                    npu_usage_of_tensor,
                    Block(ps.block_config[-3], ps.block_config[-4], ps.block_config[-1]),
                    ofm_depth_step,
                )
                # Update source tensor
                if len(ps.weight_tensor.ops) == 1 and ps.weight_tensor.ops[0].type == "DMA":
                    src_tens = ps.weight_tensor.ops[0].inputs[0]
                    src_tens.shape = ps.weight_tensor.shape
                    src_tens.weight_transpose_depthwise = ps.weight_tensor.weight_transpose_depthwise
                    src_tens.quant_values = ps.weight_tensor.quant_values
                    src_tens.compressed_values = ps.weight_tensor.compressed_values
                    src_tens.storage_shape = [1, 1, 1, ps.weight_tensor.weight_compressed_offsets[-1]]
                    src_tens.brick_size = ps.weight_tensor.brick_size
                    src_tens.weight_compression_scales = ps.weight_tensor.weight_compression_scales
                    src_tens.weight_compressed_offsets = ps.weight_tensor.weight_compressed_offsets

            if ps.scale_tensor is not None:
                rescale_for_faf = False
                activation_ops = set(("Sigmoid", "Tanh"))
                if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise):
                    rescale_for_faf = True
                calc_scales_and_pack_biases(ps.scale_tensor, arch, ps.block_config[3], rescale_for_faf)