erik.andersson@arm.com | 460c689 | 2021-02-24 14:38:09 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 16 | # Description: |
| 17 | # Compresses and pads the weigths. It also calculates the scales and packs with the biases. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 18 | from collections import namedtuple |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 19 | from collections import OrderedDict |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 20 | from typing import Tuple |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 21 | |
| 22 | import numpy as np |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 23 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 24 | from .api import NpuBlockTraversal |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 25 | from .architecture_features import Accelerator |
| 26 | from .architecture_features import ArchitectureFeatures |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 27 | from .data_type import DataType |
Louis Verhaard | 7db7896 | 2020-05-25 15:05:26 +0200 | [diff] [blame] | 28 | from .errors import UnsupportedFeatureError |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 29 | from .numeric_util import round_up |
| 30 | from .operation import NpuBlockType |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 31 | from .operation import Op |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 32 | from .scaling import quantise_scale |
| 33 | from .scaling import reduced_quantise_scale |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 34 | from .tensor import Tensor |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 35 | from .tensor import TensorFormat |
| 36 | from .tensor import TensorPurpose |
Jacob Bohlin | e843d33 | 2020-06-23 12:12:56 +0200 | [diff] [blame] | 37 | from ethosu import mlw_codec |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 38 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 39 | |
Louis Verhaard | 3c07c97 | 2020-05-07 08:12:58 +0200 | [diff] [blame] | 40 | # Contains meta info for a weight compression. If two tensors have identical weight compression config, |
| 41 | # then they also will have identical compressed weights. |
| 42 | WeightCompressionConfig = namedtuple( |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 43 | "WeightCompressionConfig", ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "dilation", "weight_value_id"], |
Louis Verhaard | 3c07c97 | 2020-05-07 08:12:58 +0200 | [diff] [blame] | 44 | ) |
| 45 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 46 | ScaleCompressionConfig = namedtuple("ScaleCompressionConfig", ["scale_value_id", "ifm_scale", "ofm_scale"]) |
| 47 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 48 | WeightKey = namedtuple("WeightKey", ["core", "depth"]) |
| 49 | |
| 50 | |
class WeightRange:
    """Bookkeeping for one encoded substream: where it starts in the encoded
    buffer and how many bytes belong to scales vs weights."""

    def __init__(self):
        self.offset = 0  # start of this range within the encoded buffer
        self.scale_bytes = 0  # size of the packed scale/bias portion
        self.weight_offset = 0  # start of the weights, relative to offset
        self.weight_bytes = 0  # size of the encoded weight portion
        self.index = 0  # sequential index of this range

    @property
    def total_bytes(self):
        """Combined size of the scale/bias and weight portions."""
        return self.weight_bytes + self.scale_bytes
| 62 | |
| 63 | |
class NpuWeightTensor(Tensor):
    """Tensor that holds an NPU-encoded weight (and/or scale) stream."""

    def __init__(self, name):
        # Shape and dtype of the base Tensor are filled in after encoding
        super().__init__(None, None, name + "_npu_encoded_weights")
        self.buffer = []  # the encoded byte stream
        self.encoded_ranges = OrderedDict()  # WeightKey -> WeightRange
        self.max_range_bytes = 0  # largest single-brick encoding, for double buffering
        self.dtype = DataType.uint8
        self.hw_traversal = NpuBlockTraversal.DEPTH_FIRST
        self.scale_compression_config = None
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 73 | |
| 74 | |
class CompressedWeightCache:
    """Global tensor weight compression cache"""

    # Maps WeightCompressionConfig -> encoded tensor
    cache = {}

    @staticmethod
    def get_tensor_with_same_compression(wcc):
        """Return a previously encoded tensor for this compression config, or None."""
        return CompressedWeightCache.cache.get(wcc)

    @staticmethod
    def add(tens):
        """Register the encoded tensor under its weight compression config."""
        CompressedWeightCache.cache[tens.weight_compression_config] = tens

    @staticmethod
    def has_tensor_with_same_compression(wcc):
        """True if an encoding for this compression config has been cached."""
        return wcc in CompressedWeightCache.cache

    @staticmethod
    def get_unencoded_size_with_same_compression(wcc):
        # NOTE(review): this indexes the cached entry with [1], which assumes the cache
        # holds tuples, whereas add() stores tensors directly — confirm callers of this
        # method populate/expect the tuple form.
        entry = CompressedWeightCache.cache.get(wcc)
        return entry[1] if entry else None
| 98 | |
| 99 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 100 | def create_weight_compression_config(weight_tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation): |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 101 | # Note: for an ofm block only its depth is used in weight compression. |
| 102 | # And block depth > ofm depth gives same result as block depth == ofm depth |
James Peet | 7519d50 | 2021-07-19 16:47:58 +0100 | [diff] [blame] | 103 | block_depth = min(ofm_block_depth, weight_tens.values.shape[-1]) |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 104 | return WeightCompressionConfig(npu_block_type, block_depth, ofm_depth_step, dilation, weight_tens.value_id) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 105 | |
Louis Verhaard | 3c07c97 | 2020-05-07 08:12:58 +0200 | [diff] [blame] | 106 | |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 107 | def encode_weights( |
| 108 | accelerator: Accelerator, |
| 109 | weights_volume: np.ndarray, |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 110 | dilation_xy: Tuple[int, int], |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 111 | ifm_bitdepth: int, |
| 112 | ofm_block_depth: int, |
| 113 | is_depthwise: bool, |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 114 | block_traversal: NpuBlockTraversal, |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 115 | ): |
| 116 | """ |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 117 | Internal implementation of the public facing API to use weight encoding. |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 118 | |
Tim Hall | c8a7386 | 2020-10-27 12:43:14 +0000 | [diff] [blame] | 119 | :param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 120 | :param weights_volume: numpy.ndarray in OHWI layout with a shape of four |
| 121 | :param dilation_xy: a two element tuple of dilation attributes in x,y dimension |
| 122 | :param ifm_bitdepth: the bitdepth of input feature map |
Tim Hall | c8a7386 | 2020-10-27 12:43:14 +0000 | [diff] [blame] | 123 | :param ofm_block_depth: the depth of blocks for Ethos-U processing |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 124 | :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 125 | :param block_traversal: indicates how these weights are traversed on sub-kernel basis |
| 126 | |
Fredrik Svedberg | f5c07c4 | 2021-04-23 14:36:42 +0200 | [diff] [blame] | 127 | :return: a tuple with a bytearray of encoded weights and the size of the unencoded weights |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 128 | """ |
Manupa Karunaratne | 8b24f2b | 2020-08-12 18:26:39 +0000 | [diff] [blame] | 129 | # Check arg types |
| 130 | assert isinstance(accelerator, Accelerator) |
| 131 | assert isinstance(weights_volume, np.ndarray) |
| 132 | assert isinstance(dilation_xy, tuple) |
| 133 | assert isinstance(ifm_bitdepth, int) |
| 134 | assert isinstance(ofm_block_depth, int) |
| 135 | assert isinstance(is_depthwise, bool) |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 136 | assert isinstance(block_traversal, NpuBlockTraversal) |
Manupa Karunaratne | 8b24f2b | 2020-08-12 18:26:39 +0000 | [diff] [blame] | 137 | |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 138 | # Checks for weight layout |
| 139 | assert len(weights_volume.shape) == 4, "weights ndarray should have a shape of 4" |
| 140 | |
| 141 | # It cannot be both partkernel and depthwise |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 142 | assert not ( |
| 143 | is_depthwise and block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST |
| 144 | ), "encode_weights :: partkernel and depthwise are mutually exclusive" |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 145 | |
| 146 | # Check valid values for dilation |
| 147 | assert dilation_xy[0] in (1, 2), "encode_weights :: dilation x should be 1 or 2 not {}".format(dilation_xy[0]) |
| 148 | assert dilation_xy[1] in (1, 2), "encode_weights :: dilation y should be 1 or 2 not {}".format(dilation_xy[1]) |
| 149 | |
| 150 | ifm_ublock = ArchitectureFeatures.accelerator_configs[accelerator].ifm_ublock |
| 151 | ofm_ublock = ArchitectureFeatures.accelerator_configs[accelerator].ofm_ublock |
James Peet | c244982 | 2021-07-19 17:09:16 +0100 | [diff] [blame] | 152 | decomp_h = ArchitectureFeatures.SubKernelMax.height // dilation_xy[1] |
| 153 | decomp_w = ArchitectureFeatures.SubKernelMax.width // dilation_xy[0] |
Mauricio Briceno | 67e11f7 | 2021-05-05 12:47:28 +0200 | [diff] [blame] | 154 | |
| 155 | return mlw_codec.reorder_encode( |
| 156 | ifm_ublock.depth, |
| 157 | ofm_ublock.depth, |
| 158 | weights_volume, |
| 159 | ofm_block_depth, |
| 160 | is_depthwise, |
| 161 | block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST, |
| 162 | ifm_bitdepth, |
| 163 | decomp_h, |
| 164 | decomp_w, |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 165 | ) |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 166 | |
| 167 | |
Manupa Karunaratne | bef228b | 2020-07-29 18:06:28 +0100 | [diff] [blame] | 168 | def encode_bias(bias: np.int64, scale: int, shift: int): |
| 169 | """ |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 170 | Internal implementation of public facing API to pack bias and scale values as required by the Ethos-U |
Tim Hall | c8a7386 | 2020-10-27 12:43:14 +0000 | [diff] [blame] | 171 | |
Manupa Karunaratne | bef228b | 2020-07-29 18:06:28 +0100 | [diff] [blame] | 172 | :param bias: 64bit signed number that includes 40bit signed bias |
| 173 | :param scale: 32bit scale value |
| 174 | :param shift: 6bit shift value |
| 175 | :return: packed 80bit [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)] |
| 176 | """ |
Manupa Karunaratne | 8b24f2b | 2020-08-12 18:26:39 +0000 | [diff] [blame] | 177 | # Check arg types |
| 178 | assert isinstance(bias, np.int64) |
| 179 | assert isinstance(scale, int) |
| 180 | assert isinstance(shift, int) |
| 181 | |
Manupa Karunaratne | bef228b | 2020-07-29 18:06:28 +0100 | [diff] [blame] | 182 | assert -(1 << (40 - 1)) <= bias < (1 << (40 - 1)) # signed 40-bit range |
| 183 | assert 0 <= scale < (1 << 32) # unsigned 32-bit range |
| 184 | assert 0 <= shift < (1 << 6) # unsigned 6-bit range |
| 185 | |
| 186 | data = bytearray(10) |
| 187 | data[0] = (bias >> (0 * 8)) & 0xFF |
| 188 | data[1] = (bias >> (1 * 8)) & 0xFF |
| 189 | data[2] = (bias >> (2 * 8)) & 0xFF |
| 190 | data[3] = (bias >> (3 * 8)) & 0xFF |
| 191 | data[4] = (bias >> (4 * 8)) & 0xFF |
| 192 | data[5] = (scale >> (0 * 8)) & 0xFF |
| 193 | data[6] = (scale >> (1 * 8)) & 0xFF |
| 194 | data[7] = (scale >> (2 * 8)) & 0xFF |
| 195 | data[8] = (scale >> (3 * 8)) & 0xFF |
| 196 | data[9] = shift & 0x3F |
| 197 | return data |
| 198 | |
| 199 | |
Tim Hall | f7e810a | 2020-06-25 15:04:31 +0100 | [diff] [blame] | 200 | def core_deinterleave(hwio, core, ncores): |
| 201 | # Put weights back into OHWI |
Jacob Bohlin | e843d33 | 2020-06-23 12:12:56 +0200 | [diff] [blame] | 202 | ohwi = np.transpose(hwio, (3, 0, 1, 2)) |
| 203 | return ohwi[core : ohwi.shape[0] : ncores] |
| 204 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 205 | |
Patrik Gustavsson | 8f1f9aa | 2021-06-28 07:41:58 +0200 | [diff] [blame] | 206 | def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling): |
Andreas Nevalainen | 897cc14 | 2020-10-28 15:42:08 +0100 | [diff] [blame] | 207 | assert tens.purpose in [TensorPurpose.FeatureMap, TensorPurpose.FSBias] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 208 | assert tens.format == TensorFormat.NHWC |
| 209 | # the connected operator should expect a bias input unless it is a FullyConnected |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 210 | assert tens.consumer_list[0].type.needs_bias() |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 211 | # the input bias tensor is the same as that connected to the operator |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 212 | bias_tens = tens.consumer_list[0].bias |
Jacob Bohlin | cf7da10 | 2020-05-20 09:03:40 +0200 | [diff] [blame] | 213 | assert tens is bias_tens |
| 214 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 215 | # the operator should only have a single output |
| 216 | assert len(tens.consumer_list[0].outputs) == 1 |
James Peet | 7519d50 | 2021-07-19 16:47:58 +0100 | [diff] [blame] | 217 | biases = tens.values |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 218 | |
| 219 | first_consumer_op = tens.consumer_list[0] |
| 220 | ifm_dtype = first_consumer_op.inputs[0].dtype |
Dwight Lidman | 4f728c0 | 2020-12-17 15:14:45 +0100 | [diff] [blame] | 221 | ifm_scale = first_consumer_op.get_input_quantization().scale_f32 |
Louis Verhaard | 98a3499 | 2020-09-01 10:39:04 +0200 | [diff] [blame] | 222 | ofm_scale = first_consumer_op.get_output_quantization().scale_f32 |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 223 | weight_scales = first_consumer_op.inputs[1].quantization.scale_f32 |
| 224 | |
| 225 | # biases can have multiple consumers for rnn cells. if so, then check that they are all the same |
| 226 | for op in tens.consumer_list[1:]: |
Dwight Lidman | 4f728c0 | 2020-12-17 15:14:45 +0100 | [diff] [blame] | 227 | assert ifm_scale == op.get_input_quantization().scale_f32 |
Louis Verhaard | 98a3499 | 2020-09-01 10:39:04 +0200 | [diff] [blame] | 228 | assert ofm_scale == op.get_output_quantization().scale_f32 |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 229 | assert weight_scales == op.inputs[1].quantization.scale_f32 |
| 230 | |
| 231 | if not hasattr(weight_scales, "__iter__"): |
| 232 | # If weight_scales is not already an iterable make it into a list |
| 233 | weight_scales = [weight_scales] |
| 234 | |
| 235 | # Convert scales to np.double (from np.float32) to conform to TensorFlow Lite which |
| 236 | # uses double during scaling calculations |
| 237 | # TensorFlow Lite casts the scales slightly differently for uint8 and int8 |
| 238 | if not rescale_for_faf: |
| 239 | if ifm_dtype == DataType.uint8: |
Dwight Lidman | 4f728c0 | 2020-12-17 15:14:45 +0100 | [diff] [blame] | 240 | # for some cases of the Mean operator, the scale must be calculated differently to match reference |
| 241 | if first_consumer_op.low_precision_scaling: |
| 242 | scales = [ |
| 243 | np.double(np.single(ifm_scale) / (np.single(weight_scale) * np.single(ofm_scale))) |
| 244 | for weight_scale in weight_scales |
| 245 | ] |
| 246 | else: |
| 247 | scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales] |
Fredrik Svedberg | d67c0aa | 2020-03-30 13:15:28 +0200 | [diff] [blame] | 248 | elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16: |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 249 | scales = [ |
| 250 | (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale) |
| 251 | for weight_scale in weight_scales |
| 252 | ] |
| 253 | else: |
Michael McGeagh | 7a6f843 | 2020-12-02 15:29:22 +0000 | [diff] [blame] | 254 | raise UnsupportedFeatureError(f"Compression of {ifm_dtype} is not implemented; Tensor: '{tens.name}'") |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 255 | else: |
| 256 | if ifm_dtype == DataType.uint8: |
| 257 | scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales] |
Fredrik Svedberg | d67c0aa | 2020-03-30 13:15:28 +0200 | [diff] [blame] | 258 | elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16: |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 259 | scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales] |
| 260 | else: |
Michael McGeagh | 7a6f843 | 2020-12-02 15:29:22 +0000 | [diff] [blame] | 261 | raise UnsupportedFeatureError(f"Compression of {ifm_dtype} is not implemented; Tensor: '{tens.name}'") |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 262 | |
Patrik Gustavsson | 8f1f9aa | 2021-06-28 07:41:58 +0200 | [diff] [blame] | 263 | if explicit_scaling: |
| 264 | assert len(explicit_scaling.shift) == len(explicit_scaling.multiplier) |
| 265 | quantised_scales = [(int(m), int(s)) for s, m in zip(explicit_scaling.shift, explicit_scaling.multiplier)] |
Fredrik Svedberg | d67c0aa | 2020-03-30 13:15:28 +0200 | [diff] [blame] | 266 | else: |
Patrik Gustavsson | 8f1f9aa | 2021-06-28 07:41:58 +0200 | [diff] [blame] | 267 | # quantise all of the weight scales into (scale_factor, shift) |
| 268 | if ifm_dtype == DataType.int16: |
| 269 | quantised_scales = [reduced_quantise_scale(scale) for scale in scales] |
| 270 | else: |
| 271 | quantised_scales = [quantise_scale(scale) for scale in scales] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 272 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 273 | # If only 1 quantised scale is used, repeat that value for the length of the biases |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 274 | if len(quantised_scales) == 1: |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 275 | quantised_scales = [quantised_scales[0]] * len(biases) |
| 276 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 277 | return quantised_scales, biases |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 278 | |
Jacob Bohlin | e843d33 | 2020-06-23 12:12:56 +0200 | [diff] [blame] | 279 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 280 | def encode_weight_and_scale_tensor( |
| 281 | arch, op, weight_tens, scale_tens, kernel, block_config, depth_offsets, rescale_for_faf=False |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 282 | ) -> (NpuWeightTensor, NpuWeightTensor): |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 283 | npu_block_type = op.type.npu_block_type |
| 284 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 285 | ifm_scale = scale_tens and scale_tens.consumer_list[0].get_input_quantization().scale_f32 |
| 286 | ofm_scale = scale_tens and scale_tens.consumer_list[0].get_output_quantization().scale_f32 |
| 287 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 288 | wcc = create_weight_compression_config( |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 289 | weight_tens, npu_block_type, block_config.ofm_block.depth, hash(str(depth_offsets)), kernel.dilation |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 290 | ) |
| 291 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 292 | scc = ScaleCompressionConfig(scale_tens and scale_tens.value_id, ifm_scale, ofm_scale) |
| 293 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 294 | tens_cached = CompressedWeightCache.get_tensor_with_same_compression(wcc) |
| 295 | if tens_cached is not None: |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 296 | if tens_cached.scale_compression_config == scc: |
| 297 | return tens_cached, None |
| 298 | npu_tensor = NpuWeightTensor(scale_tens.name) |
| 299 | do_weights = False |
| 300 | do_scales = True |
| 301 | else: |
| 302 | npu_tensor = NpuWeightTensor(weight_tens.name) |
| 303 | do_weights = True |
| 304 | do_scales = True |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 305 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 306 | npu_tensor.weight_compression_config = wcc |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 307 | npu_tensor.scale_compression_config = scc |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 308 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 309 | # Ensure depth offsets are terminated at end of OFM shape |
| 310 | assert len(depth_offsets) > 1, "Require closed depth ranges" |
| 311 | |
| 312 | ifm_bitdepth = op.inputs[0].dtype.size_in_bits() |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 313 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 314 | # No cache hit, need to perform the encoding |
| 315 | if do_weights: |
| 316 | assert weight_tens.quantization is not None |
Patrik Gustavsson | b081d67 | 2021-08-25 13:49:25 +0200 | [diff] [blame^] | 317 | assert weight_tens.quantization.scale_f32 is not None or op.explicit_scaling |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 318 | assert weight_tens.quantization.zero_point is not None |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 319 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 320 | # Early zero-point correction |
James Peet | 7519d50 | 2021-07-19 16:47:58 +0100 | [diff] [blame] | 321 | quant_buf = weight_tens.values.astype(np.int16) |
Tim Hall | b279844 | 2021-06-24 19:31:38 +0100 | [diff] [blame] | 322 | # the zero point can be either a native or numpy type |
| 323 | if isinstance(weight_tens.quantization.zero_point, (int, float)): |
| 324 | zero_point = np.int16(weight_tens.quantization.zero_point) |
| 325 | else: |
| 326 | zero_point = weight_tens.quantization.zero_point.astype(np.int16) |
| 327 | weights = quant_buf - zero_point |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 328 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 329 | if len(weights.shape) == 2: |
| 330 | weights = np.expand_dims(np.expand_dims(weights, axis=0), axis=0) |
| 331 | |
| 332 | # Expect this (undilated) equivalence |
| 333 | assert kernel.height == weights.shape[0] |
| 334 | assert kernel.width == weights.shape[1] |
| 335 | |
| 336 | ifm_depth = weights.shape[-2] |
| 337 | |
| 338 | # Default HW traversal |
| 339 | npu_tensor.hw_traversal = NpuBlockTraversal.DEPTH_FIRST |
| 340 | |
| 341 | if npu_block_type == NpuBlockType.ConvolutionMxN: |
| 342 | # Determine which block traversal strategy has better DPU utilization |
| 343 | kernel_size = weights.shape[0] * weights.shape[1] |
| 344 | depth_utilization = weights.shape[2] / round_up(weights.shape[2], 32 if ifm_bitdepth == 8 else 16) |
| 345 | part_kernel_utilization = (weights.shape[2] / round_up(weights.shape[2], 8)) * ( |
| 346 | kernel_size / round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2) |
| 347 | ) |
| 348 | if part_kernel_utilization >= depth_utilization or ifm_depth <= 8: |
| 349 | # Part-kernel first is always better for ifm depths <= 8 |
| 350 | npu_tensor.hw_traversal = NpuBlockTraversal.PART_KERNEL_FIRST |
| 351 | |
| 352 | if op.type == Op.Conv2DBackpropInputSwitchedBias: |
| 353 | # Transpose Convoluion, reverse weights in H and W axes |
| 354 | weights = np.flip(weights, axis=(0, 1)) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 355 | |
| 356 | encoded_stream = bytearray() |
| 357 | max_single_buffer_len = 0 |
| 358 | is_depthwise = npu_block_type == NpuBlockType.ConvolutionDepthWise |
| 359 | |
| 360 | # Bias & scale |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 361 | if do_scales: |
Patrik Gustavsson | 8f1f9aa | 2021-06-28 07:41:58 +0200 | [diff] [blame] | 362 | quantised_scales, biases = _prepare_scale_and_bias(arch, scale_tens, rescale_for_faf, op.explicit_scaling) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 363 | scale_tens.element_size_bytes = 10 |
| 364 | |
| 365 | # Slice the weight stream up depth-ways into bricks and compress |
James Peet | 7519d50 | 2021-07-19 16:47:58 +0100 | [diff] [blame] | 366 | full_ofm_depth = weight_tens.values.shape[-1] |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 367 | ofm_block_depth = block_config.ofm_block.depth |
| 368 | |
| 369 | weight_range_index = 0 |
| 370 | for idx, depth_offset in enumerate(depth_offsets[:-1]): |
| 371 | # Do not generate for offsets outside the OFM |
| 372 | assert depth_offset >= 0 and depth_offset < full_ofm_depth |
| 373 | depth_length = depth_offsets[idx + 1] - depth_offset |
| 374 | |
| 375 | # Get the weights necessary for this brick |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 376 | if do_weights: |
| 377 | brick_weights = weights[:, :, :, depth_offset : depth_offset + depth_length] |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 378 | |
| 379 | buffer_start_offset = len(encoded_stream) |
| 380 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 381 | # For each core, deinterleave weights/scales from the larger volume |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 382 | # and generate separate compressed streams. |
| 383 | for core in range(0, min(arch.ncores, full_ofm_depth)): |
| 384 | |
| 385 | core_block_depth = int((ofm_block_depth + arch.ncores - 1 - core) // arch.ncores) |
| 386 | |
| 387 | if core_block_depth != 0: |
| 388 | key = WeightKey(core, depth_offset) |
| 389 | weight_range = WeightRange() |
| 390 | weight_range.offset = len(encoded_stream) |
| 391 | weight_range.index = weight_range_index |
| 392 | weight_range_index += 1 |
| 393 | |
| 394 | # Scales & biases |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 395 | if do_scales: |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 396 | scale_stream = [] |
| 397 | core_scales = quantised_scales[ |
| 398 | depth_offset + core : depth_offset + core + depth_length : arch.ncores |
| 399 | ] |
| 400 | core_biases = biases[depth_offset + core : depth_offset + core + depth_length : arch.ncores] |
| 401 | for j, core_bias in enumerate(core_biases): |
| 402 | scale_stream.extend(encode_bias(np.int64(core_bias), *core_scales[j])) |
| 403 | |
| 404 | weight_range.scale_bytes = len(scale_stream) |
| 405 | |
| 406 | encoded_stream.extend(scale_stream) |
| 407 | |
| 408 | # Align to 16 for start of next substream |
| 409 | remainder = len(encoded_stream) % 16 |
| 410 | if remainder > 0: |
| 411 | encoded_stream.extend(bytearray(16 - remainder)) |
| 412 | |
| 413 | # Weights |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 414 | if do_weights: |
| 415 | core_weights = core_deinterleave(brick_weights, core, arch.ncores) |
| 416 | encoded_substream, _ = encode_weights( |
| 417 | accelerator=arch.accelerator_config, |
| 418 | weights_volume=core_weights, |
| 419 | dilation_xy=kernel.dilation, |
| 420 | ifm_bitdepth=ifm_bitdepth, |
| 421 | ofm_block_depth=core_block_depth, |
| 422 | is_depthwise=is_depthwise, |
| 423 | block_traversal=npu_tensor.hw_traversal, |
| 424 | ) |
| 425 | weight_range.weight_offset = len(encoded_stream) - weight_range.offset |
| 426 | weight_range.weight_bytes = len(encoded_substream) |
| 427 | # Append encoded section |
| 428 | encoded_stream.extend(encoded_substream) |
| 429 | assert len(encoded_stream) % 16 == 0 |
Diqing Zhong | 66d7ec0 | 2021-02-01 19:07:04 +0100 | [diff] [blame] | 430 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 431 | # Record encoded range in tensor |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 432 | npu_tensor.encoded_ranges[key] = weight_range |
| 433 | |
| 434 | # Remember maximum encoded length for DoubleBuffering |
| 435 | max_single_buffer_len = max(max_single_buffer_len, len(encoded_stream) - buffer_start_offset) |
| 436 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 437 | # Attach buffer to tensor |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 438 | npu_tensor.buffer = encoded_stream |
| 439 | npu_tensor.max_range_bytes = max_single_buffer_len |
| 440 | npu_tensor.set_all_shapes([1, 1, 1, len(encoded_stream)]) |
| 441 | npu_tensor.format = TensorFormat.WeightsCompressed |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 442 | |
| 443 | # Scale only tensor |
| 444 | if not do_weights: |
| 445 | npu_tensor.weight_compression_config = None |
| 446 | npu_tensor.purpose = TensorPurpose.FSBias |
| 447 | npu_tensor.mem_area = scale_tens.mem_area |
| 448 | npu_tensor.mem_type = scale_tens.mem_type |
| 449 | weights_tensor = tens_cached |
| 450 | scale_tensor = npu_tensor |
| 451 | else: |
| 452 | npu_tensor.purpose = TensorPurpose.Weights |
| 453 | npu_tensor.mem_area = weight_tens.mem_area |
| 454 | npu_tensor.mem_type = weight_tens.mem_type |
| 455 | weights_tensor = npu_tensor |
| 456 | scale_tensor = None |
| 457 | CompressedWeightCache.add(weights_tensor) |
| 458 | |
| 459 | return weights_tensor, scale_tensor |