# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Compresses and pads the weights. It also calculates the scales and packs them with the biases.

import os
import sys
import enum
import math
import numpy as np
from collections import namedtuple
from .numeric_util import round_up
from .scaling import quantise_scale, reduced_quantise_scale
from .tensor import TensorPurpose, TensorSubPurpose, TensorFormat, TensorBlockTraversal
from .operation import NpuBlockType
from .architecture_features import Block
from .nn_graph import SchedulingStrategy
from .data_type import DataType

from ethosu import mlw_codec


def encode(weight_stream):
    assert np.amin(weight_stream) >= -255
    assert np.amax(weight_stream) <= 255

    # Encode flattened signed weight stream
    compressed = mlw_codec.encode(weight_stream)

    # pad with 0xFF as needed so the length of the weight stream
    # is a multiple of 16

    while (len(compressed) % 16) != 0:
        compressed.append(0xFF)

    return compressed
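
# Worked example of the padding rule above: if the codec emits 21 bytes for a
# stream, eleven 0xFF bytes are appended so that len(compressed) == 32, which
# keeps every encoded stream 16-byte aligned for the offsets built up in
# compress_weights() below.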


def generate_brick(arch, brick_weights, ofm_block, block_traversal, ifm_bitdepth):
    is_depthwise = block_traversal == TensorBlockTraversal.DepthWise
    is_partkernel = block_traversal == TensorBlockTraversal.PartKernelFirst
    subkernel_max = arch.subkernel_max
    ofm_ublock = arch.ofm_ublock
    ifm_ublock = arch.ifm_ublock
    # Expect weights formatted HWIO
    ofm_depth = brick_weights.shape[-1]
    ifm_depth = brick_weights.shape[-2]
    kernel_width = brick_weights.shape[-3]
    kernel_height = brick_weights.shape[-4]
    # IFM block depth
    if is_partkernel or (ifm_bitdepth == 16):
        # IFM block depth is always 16 for part-kernel-first
        ifm_block_depth = 16
    elif ifm_bitdepth == 8:
        ifm_block_depth = 32
    else:
        assert False
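    # Either way an IFM block covers 256 bits of input depth (32 x 8-bit or
    # 16 x 16-bit), presumably matching a fixed per-block budget in the hardware.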

    stream = []

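    # Loop nest summary - the stream is emitted in this order:
    #   OFM block (over brick depth) -> IFM block -> subkernel Y -> subkernel X
    #   -> outer IFM ublock (part-kernel-first only) -> OFM ublock -> kernel element
    #   -> inner IFM ublock (depth-first only) -> OFM ublock element -> IFM ublock element
    # Out-of-range source positions append zero so every block has a fixed size.
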
    # Top level striping - OFM blocks in the entire brick's depth
    for ofm_block_z in range(0, ofm_depth, ofm_block.depth):
        clipped_ofm_block_depth = min(ofm_block.depth, ofm_depth - ofm_block_z)
        # IFM blocks required for the brick
        for ifm_block_z in range(0, (1 if is_depthwise else ifm_depth), ifm_block_depth):
            if is_depthwise:
                clipped_ifm_block_depth = ifm_ublock.depth
            else:
                clipped_ifm_block_depth = (
                    min(ifm_block_depth, ifm_depth - ifm_block_z) if is_partkernel else ifm_block_depth
                )
            # Weight decomposition
            # Subkernel splitting (H)
            for subkernel_y in range(0, kernel_height, subkernel_max.height):
                sub_height = min(kernel_height - subkernel_y, subkernel_max.height)
                # Subkernel splitting (W)
                for subkernel_x in range(0, kernel_width, subkernel_max.width):
                    sub_width = min(kernel_width - subkernel_x, subkernel_max.width)
                    subkernel_elements = sub_width * sub_height
                    # Part kernel first works across the kernel H/W and needs padding
                    if is_partkernel:
                        if ifm_bitdepth == 16 and subkernel_elements % 2 != 0:
                            subkernel_elements = int(math.ceil(subkernel_elements / 2) * 2)
                        elif ifm_bitdepth == 8 and subkernel_elements % 4 != 0:
                            subkernel_elements = int(math.ceil(subkernel_elements / 4) * 4)

                    # Depthwise Conv requires a multiple of 4 kernel elements in its weight block;
                    # this is different from normal conv, which is considered "weights depth-first"
                    elif is_depthwise:
                        subkernel_elements = int(math.ceil(subkernel_elements / 4.0) * 4)
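                    # For instance, a 3x3 subkernel (9 elements) is padded to 10
                    # elements for 16-bit part-kernel-first, and to 12 elements for
                    # 8-bit part-kernel-first or for depthwise traversal.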

                    ifm_block_depth_outer = clipped_ifm_block_depth if is_partkernel else 1
                    ifm_block_depth_inner = 1 if is_partkernel else clipped_ifm_block_depth
                    # IFM Ublocks in IFM-block over depth for part-kernel-first mode
                    # For depth-first, IFM Ublocks are traversed after the subkernel elements, so this loop is ignored.
                    for ifm_ublk_outer in range(0, ifm_block_depth_outer, ifm_ublock.depth):
                        # OFM Ublocks in OFM-block over depth
                        for ofm_ublk in range(0, clipped_ofm_block_depth, ofm_ublock.depth):
                            # HW Kernel element traversal - cannot be a H/W loop due to element
                            # padding requirement on depthwise/part-kernel configurations
                            for element in range(subkernel_elements):
                                kx = element % sub_width
                                ky = element // sub_width
                                # IFM Ublocks in IFM-block over depth (only 1 ublock if depthwise)
                                # In the part-kernel-first case the IFM Ublock traversal has already been handled
                                # and this loop is ignored.
                                for ifm_ublk_inner in range(0, ifm_block_depth_inner, ifm_ublock.depth):
                                    # Feed OFM ublock elements
                                    for ofm_ublock_z in range(ofm_ublock.depth):
                                        # Source IFM ublock elements (only 1 element deep if depthwise)
                                        for ifm_ublock_z in range(1 if is_depthwise else ifm_ublock.depth):
                                            # Source position within the current subkernel
                                            wx = subkernel_x + kx
                                            wy = subkernel_y + ky
                                            # Source IFM/OFM slices
                                            ifm_ublk = ifm_ublk_inner + ifm_ublk_outer
                                            ifm_z = ifm_block_z + ifm_ublk + ifm_ublock_z
                                            ofm_z = ofm_block_z + ofm_ublk + ofm_ublock_z
                                            if (ifm_z >= ifm_depth) or (ofm_z >= ofm_depth) or (ky >= sub_height):
                                                stream.append(0)
                                            else:
                                                stream.append(brick_weights[wy][wx][ifm_z][ofm_z])
    return stream


# Compress the weights
def compress_weights(tens, arch, npu_block_type, ofm_block, ofm_depth_step, min_val=None, max_val=None):
    assert tens.purpose == TensorPurpose.Weights
    assert tens.format == TensorFormat.WeightsCompressed

    WeightCompressionConfig = namedtuple("WeightCompressionConfig", ["npu_block_type", "ofm_block", "ofm_depth_step"])

    # check if weights have already been compressed
    wcc = tens.weight_compression_config
    if wcc is not None:
        assert wcc.npu_block_type == npu_block_type, "Weights not used by the same operator type"

        if wcc.ofm_block == ofm_block and wcc.ofm_depth_step == ofm_depth_step:
            return

    assert tens.quantization is not None
    assert tens.quantization.scale_f32 is not None
    assert tens.quantization.zero_point is not None

    zero_point = tens.quantization.zero_point
    quant_buf = tens.quant_values.astype(np.int64)

    # Early zero-point correction
    weights = quant_buf - zero_point

    if len(weights.shape) == 2:
        weights = np.expand_dims(np.expand_dims(weights, axis=0), axis=0)
        weights_shape = (weights.shape[0], 1, 1, weights.shape[1])
    else:
        weights_shape = weights.shape

    compression_scales = []
    compressed_offsets = []
    encoded_streams = []
    offset = 0
    max_single_buffer_len = 0

    ifm_bitdepth = tens.consumer_list[0].inputs[0].dtype.size_in_bits()
    ifm_depth = weights.shape[-2]
    if npu_block_type == NpuBlockType.ConvolutionDepthWise:
        tens.block_traversal = TensorBlockTraversal.DepthWise
    if npu_block_type == NpuBlockType.ConvolutionMxN:
        # Determine which block traversal strategy has better DPU utilization
        kernel_size = weights_shape[0] * weights_shape[1]
        depth_utilization = weights_shape[2] / round_up(weights_shape[2], 32 if ifm_bitdepth == 8 else 16)
        part_kernel_utilization = (weights_shape[2] / round_up(weights_shape[2], 8)) * (
            kernel_size / round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2)
        )
        if part_kernel_utilization >= depth_utilization or ifm_depth <= 8:
            # Part-kernel first is always better for ifm depths <= 8
            tens.block_traversal = TensorBlockTraversal.PartKernelFirst
        else:
            tens.block_traversal = TensorBlockTraversal.DepthFirst
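        # Worked example of the heuristic above: a 3x3 kernel with ifm depth 16
        # and an 8-bit ifm gives depth_utilization = 16/32 = 0.5 and
        # part_kernel_utilization = (16/16) * (9/12) = 0.75, so part-kernel-first
        # is selected.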

    # Slice weight stream up depth-ways into bricks and compress
    full_ofm_depth = quant_buf.shape[-1]
    for idx in range(0, full_ofm_depth, ofm_depth_step):
        # Get the weights necessary for this brick
        count = min(full_ofm_depth - idx, ofm_depth_step)
        brick_weights = weights[:, :, :, idx : idx + count]

        # Encode all weights into one chunk
        raw_stream = generate_brick(arch, brick_weights, ofm_block, tens.block_traversal, ifm_bitdepth)
        encoded = encode(raw_stream)
        encoded_streams.append(encoded)

        # Remember maximum encoded length for DoubleBuffering
        if max_single_buffer_len < len(encoded):
            max_single_buffer_len = len(encoded)

        # Remember where we put it for linear addressing
        compressed_offsets.append(offset)
        offset += len(encoded)
        assert offset % 16 == 0

        # Compression scale tracking
        compression_scales.append(len(encoded) / len(raw_stream))

    # Also track complete length in the offsets array
    compressed_offsets.append(offset)
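    # compressed_offsets now holds one more entry than encoded_streams, so stream i
    # occupies bytes [compressed_offsets[i], compressed_offsets[i + 1]) of the
    # concatenated buffer.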

    if tens.sub_purpose == TensorSubPurpose.DoubleBuffer and len(encoded_streams) > 2:
        offset = 2 * max_single_buffer_len
        assert offset % 16 == 0

    tens.storage_shape = [1, 1, 1, offset]
    tens.weight_compression_scales = compression_scales
    tens.weight_compression_config = WeightCompressionConfig(npu_block_type, ofm_block, ofm_depth_step)
    tens.weight_compressed_offsets = compressed_offsets
    tens.compression_scale_for_worst_weight_stream = np.amax(compression_scales)
    tens.storage_compression_scale = tens.bandwidth_compression_scale = np.average(compression_scales)
    tens.compressed_values = encoded_streams
    tens.brick_size = (weights_shape[0], weights_shape[1], weights_shape[2], min(tens.shape[-1], ofm_depth_step))


def calc_scales_and_pack_biases(tens, arch, oc_quantum, rescale_for_faf=False):
    assert tens.purpose == TensorPurpose.FeatureMap
    assert tens.format == TensorFormat.NHWC
    # the connected operator should expect a bias input unless it is a FullyConnected
    assert "Bias" in tens.consumer_list[0].type or tens.consumer_list[0].type.startswith("FullyConnected")
    # the input bias tensor is the same as that connected to the operator
    assert tens is tens.consumer_list[0].inputs[2]
    # the operator should only have a single output
    assert len(tens.consumer_list[0].outputs) == 1

    def pack_bias_and_scale(bias, scale, shift):
        bias = np.int64(bias)
        assert -(1 << (40 - 1)) <= bias < (1 << (40 - 1))  # signed 40-bit range
        assert 0 <= scale < (1 << 32)  # unsigned 32-bit range
        assert 0 <= shift < (1 << 6)  # unsigned 6-bit range

        # pack the 80 bit value = [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
        data = bytearray(10)
        data[0] = (bias >> (0 * 8)) & 0xFF
        data[1] = (bias >> (1 * 8)) & 0xFF
        data[2] = (bias >> (2 * 8)) & 0xFF
        data[3] = (bias >> (3 * 8)) & 0xFF
        data[4] = (bias >> (4 * 8)) & 0xFF
        data[5] = (scale >> (0 * 8)) & 0xFF
        data[6] = (scale >> (1 * 8)) & 0xFF
        data[7] = (scale >> (2 * 8)) & 0xFF
        data[8] = (scale >> (3 * 8)) & 0xFF
        data[9] = shift & 0x3F
        return data
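
    # For instance, pack_bias_and_scale(bias=1, scale=1, shift=16) produces the
    # little-endian bytes 01 00 00 00 00 | 01 00 00 00 | 10
    # (40-bit bias | 32-bit scale | shift).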

    biases = tens.quant_values

    first_consumer_op = tens.consumer_list[0]
    ifm_dtype = first_consumer_op.inputs[0].dtype
    ifm_scale = first_consumer_op.inputs[0].quantization.scale_f32
    ofm_scale = first_consumer_op.outputs[0].quantization.scale_f32
    weight_scales = first_consumer_op.inputs[1].quantization.scale_f32

    # biases can have multiple consumers for rnn cells. if so, then check that they are all the same
    for op in tens.consumer_list[1:]:
        assert ifm_scale == op.inputs[0].quantization.scale_f32
        assert ofm_scale == op.outputs[0].quantization.scale_f32
        assert weight_scales == op.inputs[1].quantization.scale_f32

    if not hasattr(weight_scales, "__iter__"):
        # If weight_scales is not already an iterable make it into a list
        weight_scales = [weight_scales]

    # Convert scales to np.double (from np.float32) to conform to TensorFlow Lite which
    # uses double during scaling calculations
    # TensorFlow Lite casts the scales slightly differently for uint8 and int8
    if not rescale_for_faf:
        if ifm_dtype == DataType.uint8:
            scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [
                (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale)
                for weight_scale in weight_scales
            ]
        else:
            assert False, str(ifm_dtype) + " not implemented"
    else:
        if ifm_dtype == DataType.uint8:
            scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales]
        else:
            assert False, str(ifm_dtype) + " not implemented"
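    # In the normal case this is the standard TensorFlow Lite effective output
    # scale, ifm_scale * weight_scale / ofm_scale. When rescaling for a fused
    # activation function (Sigmoid/Tanh), ofm_scale is dropped and a fixed 0x3000
    # factor is applied instead, i.e. an implicit output scale of 1/0x3000.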

    # quantise all of the weight scales into (scale_factor, shift)
    if ifm_dtype == DataType.int16:
        quantised_scales = [reduced_quantise_scale(scale) for scale in scales]
    else:
        quantised_scales = [quantise_scale(scale) for scale in scales]
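    # quantise_scale() and reduced_quantise_scale() are expected to return a
    # (scale_factor, shift) pair such that scale ~= scale_factor * 2**-shift,
    # with the reduced variant using a narrower scale_factor for the int16 path.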

    for _, shift in quantised_scales:
        assert shift >= 16

    # pack the biases and scales
    tens.compressed_values = []
    if len(quantised_scales) == 1:
        # If only 1 quantised scale is used, repeat that value for the length of the biases
        quantised_scales = [quantised_scales[0]] * len(biases)

    assert len(quantised_scales) == len(biases)
    for i, bias in enumerate(biases):
        tens.compressed_values.append(pack_bias_and_scale(bias, *quantised_scales[i]))

    tens.element_size_bytes = 10

    # Figure out if we need padded storage (extra whole elements)
    padding = (len(tens.compressed_values) * tens.element_size_bytes) % 16
    if padding != 0:
        padding = 16 - padding

    # This adds enough padding to allow over-reads
    while padding > 0:
        tens.compressed_values.append(pack_bias_and_scale(0, 0, 0))
        padding = padding - tens.element_size_bytes
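    # E.g. three packed biases occupy 30 bytes, giving padding == 2; one whole
    # zeroed 10-byte element is then appended, since storage is tracked in whole
    # elements rather than raw bytes.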

    tens.storage_shape = [len(tens.compressed_values)]


def update_pass_weight_and_scale_tensors(nng, arch):
    def find_npu_usage_of_tensor(tens):
        # TODO: This function is identical to the one in mark_tensors.py. A common version should be used.
        for op in tens.consumers():
            if op.type == "DMA":
                return find_npu_usage_of_tensor(op.outputs[0])
            if "npu_block_type" in op.attrs:
                return op.attrs["npu_block_type"]
            return NpuBlockType.Default

    for sg in nng.subgraphs:
        for ps in sg.passes:
            if ps.weight_tensor is not None:
                npu_usage_of_tensor = find_npu_usage_of_tensor(ps.weight_tensor)
                if npu_usage_of_tensor == NpuBlockType.ConvolutionDepthWise:
                    ps.weight_tensor.quant_values = np.transpose(ps.weight_tensor.quant_values, (0, 1, 3, 2))
                    ps.weight_tensor.shape = ps.weight_tensor.storage_shape = ps.weight_tensor.bandwidth_shape = list(
                        ps.weight_tensor.quant_values.shape
                    )
                    ps.weight_tensor.weight_transpose_depthwise = True

                needs_dma = len(ps.weight_tensor.ops) == 1 and ps.weight_tensor.ops[0].type == "DMA"
                if ps.cascade.strategy == SchedulingStrategy.WeightStream and needs_dma:
                    ofm_depth_step = ps.block_config[-1]
                else:
                    ofm_depth_step = ps.weight_tensor.shape[-1]

                compress_weights(
                    ps.weight_tensor,
                    arch,
                    npu_usage_of_tensor,
                    Block(ps.block_config[-3], ps.block_config[-4], ps.block_config[-1]),
                    ofm_depth_step,
                )
                # Update source tensor
                if len(ps.weight_tensor.ops) == 1 and ps.weight_tensor.ops[0].type == "DMA":
                    src_tens = ps.weight_tensor.ops[0].inputs[0]
                    src_tens.shape = ps.weight_tensor.shape
                    src_tens.weight_transpose_depthwise = ps.weight_tensor.weight_transpose_depthwise
                    src_tens.quant_values = ps.weight_tensor.quant_values
                    src_tens.compressed_values = ps.weight_tensor.compressed_values
                    src_tens.storage_shape = [1, 1, 1, ps.weight_tensor.weight_compressed_offsets[-1]]
                    src_tens.brick_size = ps.weight_tensor.brick_size
                    src_tens.weight_compression_scales = ps.weight_tensor.weight_compression_scales
                    src_tens.weight_compressed_offsets = ps.weight_tensor.weight_compressed_offsets

            if ps.scale_tensor is not None:
                rescale_for_faf = False
                activation_ops = set(("Sigmoid", "Tanh"))
                if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise):
                    rescale_for_faf = True
                calc_scales_and_pack_biases(ps.scale_tensor, arch, ps.block_config[3], rescale_for_faf)