Raul Farkas | 428a8d5 | 2023-01-16 16:52:18 +0000 | [diff] [blame] | 1 | # SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com> |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Rickard Bolin | bc6ee58 | 2022-11-04 08:24:29 +0000 | [diff] [blame] | 16 | # |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 17 | # Description: |
| 18 | # Compresses and pads the weights. It also calculates the scales and packs them with the biases. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 19 | from collections import namedtuple |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 20 | from collections import OrderedDict |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 21 | from typing import Dict |
| 22 | from typing import Optional |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 23 | from typing import Tuple |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 24 | |
| 25 | import numpy as np |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 26 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 27 | from .api import NpuBlockTraversal |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 28 | from .architecture_features import Accelerator |
| 29 | from .architecture_features import ArchitectureFeatures |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 30 | from .data_type import DataType |
Louis Verhaard | 7db7896 | 2020-05-25 15:05:26 +0200 | [diff] [blame] | 31 | from .errors import UnsupportedFeatureError |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 32 | from .numeric_util import round_up |
| 33 | from .operation import NpuBlockType |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 34 | from .operation import Op |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 35 | from .scaling import quantise_scale |
| 36 | from .scaling import reduced_quantise_scale |
Johan Alfven | 347c57b | 2023-04-03 15:29:13 +0200 | [diff] [blame] | 37 | from .tensor import QuantizationParameters |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 38 | from .tensor import Tensor |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 39 | from .tensor import TensorFormat |
| 40 | from .tensor import TensorPurpose |
Raul Farkas | 428a8d5 | 2023-01-16 16:52:18 +0000 | [diff] [blame] | 41 | |
# Import the mlw_codec extension module. A RuntimeError whose text reports an
# API version mismatch is re-raised as an ImportError carrying a more helpful
# message; any other RuntimeError propagates unchanged.
try:
    from ethosu import mlw_codec
except RuntimeError as ex:
    ex_text = str(ex)
    if "mlw_codec error: module compiled against API version" in ex_text:
        # The API versions are the tokens of the message that contain "0x"
        api_versions = [token for token in ex_text.split() if "0x" in token]
        if len(api_versions) == 2:
            build_version, runtime_version = api_versions
            # Raise new exception with more detailed message
            raise ImportError(  # pylint: disable=W0707
                "NumPy C API version mismatch "
                f"(Build-time version: {build_version}, "
                f"Run-time version: {runtime_version})"
                "\nThis is a known issue most likely caused by a change in the API "
                "version in NumPy after installing ethos-u-vela.\nYou can find more "
                "information about the issue and possible solutions in the "
                "'Known Issues' section at https://review.mlplatform.org/"
                "plugins/gitiles/ml/ethos-u/ethos-u-vela/+/refs/heads/main/"
                "README.md#known-issues"
            )
    # Not the recognised mismatch message: propagate the original error
    raise
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 63 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 64 | |
# Meta information describing one weight compression. Two tensors that share an
# identical weight compression config also share identical compressed weights.
WeightCompressionConfig = namedtuple(
    "WeightCompressionConfig",
    "npu_block_type ofm_block_depth ofm_depth_step dilation weight_value_id",
)

# Meta information describing the scales packed next to a set of compressed weights
ScaleCompressionConfig = namedtuple("ScaleCompressionConfig", "scale_value_id ifm_scale ofm_scale")

# Identifies one encoded range: the core it belongs to and its starting OFM depth offset
WeightKey = namedtuple("WeightKey", "core depth")
| 75 | |
| 76 | |
class WeightRange:
    """Byte-level bookkeeping for one encoded range inside an encoded stream."""

    def __init__(self):
        # Position of this range within the encoded stream
        self.offset = 0
        # Number of bytes taken by the packed scales and biases
        self.scale_bytes = 0
        # Position of the encoded weights relative to `offset`
        self.weight_offset = 0
        # Number of bytes taken by the encoded weights
        self.weight_bytes = 0
        # Running index of this range
        self.index = 0

    @property
    def total_bytes(self):
        """Combined size of the scale and weight parts, excluding any padding between them."""
        return self.scale_bytes + self.weight_bytes
| 88 | |
| 89 | |
class NpuWeightTensor(Tensor):
    """Tensor that carries an encoded weight and/or scale stream."""

    def __init__(self, name):
        """Create an empty encoded tensor named ``name`` + "_npu_encoded_weights"."""
        # NOTE(review): the two leading None arguments are presumably the shape and
        # dtype expected by Tensor.__init__ - confirm against the Tensor class
        super().__init__(None, None, name + "_npu_encoded_weights")
        self.buffer = []
        # Required sizes if double buffering is used
        self.double_buffer_sizes = [0, 0]
        # Insertion-ordered mapping of the encoded ranges
        self.encoded_ranges = OrderedDict()
        self.hw_traversal = NpuBlockTraversal.DEPTH_FIRST
        self.dtype = DataType.uint8
        self.scale_compression_config = None

    def max_range_bytes(self):
        """Return the larger of the two double buffer sizes"""
        return max(self.double_buffer_sizes)

    def double_buffer_size(self):
        """Return total required size for double buffering"""
        return sum(self.double_buffer_sizes)
| 106 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 107 | |
class CompressedWeightCache:
    """Global tensor weight compression cache"""

    # Shared by all users of the class: weight compression config -> tensor
    cache: Dict[WeightCompressionConfig, Tensor] = {}

    @staticmethod
    def get_tensor_with_same_compression(wcc):
        """Return the tensor cached for ``wcc``, or None when there is none."""
        return CompressedWeightCache.cache.get(wcc)

    @staticmethod
    def add(tens):
        """Store ``tens`` in the cache under its own weight compression config."""
        CompressedWeightCache.cache[tens.weight_compression_config] = tens

    @staticmethod
    def has_tensor_with_same_compression(wcc):
        """Return True when a tensor is cached for ``wcc``."""
        return wcc in CompressedWeightCache.cache

    @staticmethod
    def get_unencoded_size_with_same_compression(wcc):
        """Return element 1 of the entry cached for ``wcc``, or None without a (truthy) entry."""
        # NOTE(review): add() stores tensor objects, yet this indexes the cached entry
        # like a (tensor, size) pair - confirm whether this method is still used
        entry = CompressedWeightCache.cache.get(wcc)
        if not entry:
            return None
        return entry[1]
| 131 | |
| 132 | |
def create_weight_compression_config(weight_tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
    """Build the WeightCompressionConfig that identifies how ``weight_tens`` gets compressed.

    :param weight_tens: weight tensor; its ``values`` and ``value_id`` are read
    :param npu_block_type: NPU block type of the consuming operator
    :param ofm_block_depth: depth of the OFM block
    :param ofm_depth_step: value identifying the OFM depth slicing
    :param dilation: kernel dilation
    :return: the resulting WeightCompressionConfig
    """
    # Note: for an ofm block only its depth is used in weight compression.
    # And block depth > ofm depth gives same result as block depth == ofm depth,
    # so the depth is clamped to the size of the last axis of the weight values
    clamped_depth = min(ofm_block_depth, weight_tens.values.shape[-1])
    return WeightCompressionConfig(
        npu_block_type=npu_block_type,
        ofm_block_depth=clamped_depth,
        ofm_depth_step=ofm_depth_step,
        dilation=dilation,
        weight_value_id=weight_tens.value_id,
    )
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 138 | |
Louis Verhaard | 3c07c97 | 2020-05-07 08:12:58 +0200 | [diff] [blame] | 139 | |
def encode_weights(
    accelerator: Accelerator,
    weights_volume: np.ndarray,
    dilation_xy: Tuple[int, int],
    ifm_bitdepth: int,
    ofm_block_depth: int,
    is_depthwise: bool,
    block_traversal: NpuBlockTraversal,
):
    """
    Internal implementation of the public facing API to use weight encoding.

    :param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator
    :param weights_volume: numpy.ndarray in OHWI layout with a shape of four
    :param dilation_xy: a two element tuple of dilation attributes in x,y dimension
    :param ifm_bitdepth: the bitdepth of input feature map
    :param ofm_block_depth: the depth of blocks for Ethos-U processing
    :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
    :param block_traversal: indicates how these weights are traversed on sub-kernel basis

    :return: a tuple with a bytearray of encoded weights and the size of the unencoded weights
    """
    # Check arg types
    assert isinstance(accelerator, Accelerator)
    assert isinstance(weights_volume, np.ndarray)
    assert isinstance(dilation_xy, tuple)
    assert isinstance(ifm_bitdepth, int)
    assert isinstance(ofm_block_depth, int)
    assert isinstance(is_depthwise, bool)
    assert isinstance(block_traversal, NpuBlockTraversal)

    # Checks for weight layout
    assert len(weights_volume.shape) == 4, "weights ndarray should have a shape of 4"

    # It cannot be both partkernel and depthwise
    is_part_kernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    assert not (is_depthwise and is_part_kernel), "encode_weights :: partkernel and depthwise are mutually exclusive"

    # Check valid values for dilation
    assert dilation_xy[0] in (1, 2), f"encode_weights :: dilation x should be 1 or 2 not {dilation_xy[0]}"
    assert dilation_xy[1] in (1, 2), f"encode_weights :: dilation y should be 1 or 2 not {dilation_xy[1]}"

    accel_config = ArchitectureFeatures.accelerator_configs[accelerator]
    # The maximum sub-kernel size is divided by the dilation in each dimension
    decomp_h = ArchitectureFeatures.SubKernelMax.height // dilation_xy[1]
    decomp_w = ArchitectureFeatures.SubKernelMax.width // dilation_xy[0]

    return mlw_codec.reorder_encode(
        accel_config.ifm_ublock.depth,
        accel_config.ofm_ublock.depth,
        weights_volume,
        ofm_block_depth,
        is_depthwise,
        is_part_kernel,
        ifm_bitdepth,
        decomp_h,
        decomp_w,
    )
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 199 | |
| 200 | |
def encode_bias(bias: np.int64, scale: int, shift: int):
    """
    Internal implementation of public facing API to pack bias and scale values as required by the Ethos-U

    :param bias: 64bit signed number that includes 40bit signed bias
    :param scale: 32bit scale value
    :param shift: 6bit shift value
    :return: packed 80bit [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
    """
    # Check arg types
    assert isinstance(bias, np.int64)
    assert isinstance(scale, int)
    assert isinstance(shift, int)

    # Check value ranges
    assert -(1 << (40 - 1)) <= bias < (1 << (40 - 1))  # signed 40-bit range
    assert 0 <= scale < (1 << 32)  # unsigned 32-bit range
    assert 0 <= shift < (1 << 6)  # unsigned 6-bit range

    packed = bytearray(10)
    # Bytes 0-4: the bias, least significant byte first
    for byte_index in range(5):
        packed[byte_index] = (bias >> (byte_index * 8)) & 0xFF
    # Bytes 5-8: the scale, least significant byte first
    for byte_index in range(4):
        packed[5 + byte_index] = (scale >> (byte_index * 8)) & 0xFF
    # Byte 9: the shift in the low 6 bits, the top 2 bits stay zero
    packed[9] = shift & 0x3F
    return packed
| 231 | |
| 232 | |
def core_deinterleave(hwio, core, ncores):
    """Return the OHWI weights that belong to ``core`` when output channels are interleaved across cores.

    :param hwio: weights in HWIO layout
    :param core: index of the core to extract the weights for
    :param ncores: stride between the output channels taken
    :return: OHWI weights holding every ``ncores``-th output channel starting at ``core``
    """
    # Put weights back into OHWI
    out_first = np.transpose(hwio, axes=(3, 0, 1, 2))
    num_out_channels = out_first.shape[0]
    return out_first[slice(core, num_out_channels, ncores)]
| 237 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 238 | |
def _get_input_quantization(op):
    """Return the input quantization of ``op``, or a unit-scale, zero-offset default when it has none."""
    input_quant = op.get_input_quantization()
    if input_quant:
        return input_quant
    # No usable quantization on the operator: fall back to scale 1.0 and zero point 0
    return QuantizationParameters(scale_f32=1.0, zero_point=0)
| 244 | |
| 245 | |
def _get_output_quantization(op):
    """Return the output quantization of ``op``, or a unit-scale, zero-offset default when it has none."""
    output_quant = op.get_output_quantization()
    if output_quant:
        return output_quant
    # No usable quantization on the operator: fall back to scale 1.0 and zero point 0
    return QuantizationParameters(scale_f32=1.0, zero_point=0)
| 251 | |
| 252 | |
def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling):
    """Work out the quantised (scale, shift) pair that accompanies every bias value of ``tens``.

    :param arch: not referenced by this function
    :param tens: bias tensor; must be the bias input of its first consumer
    :param rescale_for_faf: when true the scales are multiplied by 0x3000 and the OFM scale is not applied
    :param explicit_scaling: when truthy, its ``multiplier`` and ``shift`` sequences are used
        instead of quantising the calculated scales
    :return: a tuple of the list of (scale, shift) pairs and the bias values
    :raises UnsupportedFeatureError: if the IFM data type is not handled
    """
    assert tens.purpose in (TensorPurpose.FeatureMap, TensorPurpose.FSBias)
    assert tens.format == TensorFormat.NHWC

    first_consumer_op = tens.consumer_list[0]
    # the connected operator should expect a bias input unless it is a FullyConnected
    assert first_consumer_op.type.needs_bias()
    # the input bias tensor is the same as that connected to the operator
    bias_tens = first_consumer_op.bias
    assert tens is bias_tens
    # the operator should only have a single output
    assert len(first_consumer_op.outputs) == 1

    biases = tens.values
    ifm_dtype = first_consumer_op.inputs[0].dtype
    ifm_scale = _get_input_quantization(first_consumer_op).scale_f32
    ofm_scale = _get_output_quantization(first_consumer_op).scale_f32
    weight_scales = first_consumer_op.inputs[1].quantization.scale_f32

    # biases can have multiple consumers for rnn cells. if so, then check that they are all the same
    for other_op in tens.consumer_list[1:]:
        assert ifm_scale == _get_input_quantization(other_op).scale_f32
        assert ofm_scale == _get_output_quantization(other_op).scale_f32
        assert weight_scales == other_op.inputs[1].quantization.scale_f32

    if not hasattr(weight_scales, "__iter__"):
        # If weight_scales is not already an iterable make it into a list
        weight_scales = [weight_scales]

    # Convert scales to np.double (from np.float32) to conform to TensorFlow Lite which
    # uses double during scaling calculations
    # TensorFlow Lite casts the scales slightly differently for uint8 and int8 as well as
    # for FullyConnected operators
    # NOTE: the position of each np.double cast is deliberate and must not be rearranged
    if rescale_for_faf:
        if ifm_dtype == DataType.uint8:
            scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales]
        else:
            raise UnsupportedFeatureError(f"Compression of {ifm_dtype} is not implemented; Tensor: '{tens.name}'")
    else:
        if ifm_dtype == DataType.uint8 or first_consumer_op.original_type == Op.FullyConnected:
            scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [
                (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale)
                for weight_scale in weight_scales
            ]
        else:
            raise UnsupportedFeatureError(f"Compression of {ifm_dtype} is not implemented; Tensor: '{tens.name}'")

    if explicit_scaling:
        # The caller supplied the multipliers and shifts, so nothing is quantised here
        assert len(explicit_scaling.shift) == len(explicit_scaling.multiplier)
        quantised_scales = [
            (int(multiplier), int(shift))
            for multiplier, shift in zip(explicit_scaling.multiplier, explicit_scaling.shift)
        ]
    elif ifm_dtype == DataType.int16 and bias_tens.dtype == DataType.int64:
        # Reference uses reduced scaling for int16 with int64 bias
        quantised_scales = [reduced_quantise_scale(scale) for scale in scales]
    else:
        # quantise all of the weight scales into (scale_factor, shift)
        quantised_scales = [quantise_scale(scale) for scale in scales]

    # Check the output quantisation to see if the scale value needs increasing to the next one
    if _get_output_quantization(first_consumer_op).next_after:
        quantised_scales = [(q_scale + 1, q_shift) for q_scale, q_shift in quantised_scales]

    # If only 1 quantised scale is used, repeat that value for the length of the biases
    if len(quantised_scales) == 1:
        quantised_scales = [quantised_scales[0]] * len(biases)

    return quantised_scales, biases
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 326 | |
Jacob Bohlin | e843d33 | 2020-06-23 12:12:56 +0200 | [diff] [blame] | 327 | |
def encode_weight_and_scale_tensor(
    arch, op, weight_tens, scale_tens, kernel, block_config, depth_offsets, rescale_for_faf=False
) -> Tuple[Optional[NpuWeightTensor], Optional[NpuWeightTensor]]:
    """Encode the weights and the scales/biases of ``op`` into NPU tensors, reusing cached weights.

    The OFM depth is split into bricks delimited by consecutive ``depth_offsets``. For every brick
    and every core the packed scales/biases are emitted, padded to a multiple of 16 bytes and
    followed by the encoded weights.

    :param arch: architecture; ``ncores`` and ``accelerator_config`` are read
    :param op: the operator that consumes the weights
    :param weight_tens: tensor holding the unencoded weights
    :param scale_tens: tensor holding the biases
    :param kernel: kernel of the operator; ``height``, ``width`` and ``dilation`` are read
    :param block_config: block configuration; the depth of its ``ofm_block`` is read
    :param depth_offsets: OFM depth offsets of the bricks, including the closing end offset
    :param rescale_for_faf: forwarded to the scale calculation
    :return: a tuple (weights tensor, scale tensor):
        (cached tensor, None) when both the weight and scale configs match a cached tensor,
        (cached tensor, new scale only tensor) when only the weight config matches,
        (new tensor with scales and weights, None) otherwise; the new tensor is added to the cache
    """
    npu_block_type = op.type.npu_block_type

    ifm_scale = scale_tens and _get_input_quantization(scale_tens.consumer_list[0]).scale_f32
    ofm_scale = scale_tens and _get_output_quantization(scale_tens.consumer_list[0]).scale_f32

    wcc = create_weight_compression_config(
        weight_tens, npu_block_type, block_config.ofm_block.depth, hash(str(depth_offsets)), kernel.dilation
    )
    scc = ScaleCompressionConfig(scale_tens and scale_tens.value_id, ifm_scale, ofm_scale)

    tens_cached = CompressedWeightCache.get_tensor_with_same_compression(wcc)
    # Weights only need encoding when no tensor with the same compression is cached
    do_weights = tens_cached is None
    do_scales = True
    if do_weights:
        npu_tensor = NpuWeightTensor(weight_tens.name)
    elif tens_cached.scale_compression_config == scc:
        # Weights and scales are both already available
        return tens_cached, None
    else:
        npu_tensor = NpuWeightTensor(scale_tens.name)

    npu_tensor.weight_compression_config = wcc
    npu_tensor.scale_compression_config = scc

    # Ensure depth offsets are terminated at end of OFM shape
    assert len(depth_offsets) > 1, "Require closed depth ranges"

    ifm_bitdepth = op.inputs[0].dtype.size_in_bits()

    # No cache hit, need to perform the encoding
    if do_weights:
        weight_quant = weight_tens.quantization
        assert weight_quant is not None
        assert weight_tens.quantization.scale_f32 is not None or op.explicit_scaling
        assert weight_tens.quantization.zero_point is not None

        # Early zero-point correction
        quant_buf = weight_tens.values.astype(np.int16)
        # the zero point can be either a native or numpy type
        raw_zero_point = weight_tens.quantization.zero_point
        if isinstance(raw_zero_point, (int, float)):
            zero_point = np.int16(raw_zero_point)
        else:
            zero_point = raw_zero_point.astype(np.int16)
        weights = quant_buf - zero_point

        if len(weights.shape) == 2:
            # Prepend two axes of size 1 so that the weights become four dimensional
            weights = np.expand_dims(np.expand_dims(weights, axis=0), axis=0)

        # Expect this (undilated) equivalence
        assert kernel.height == weights.shape[0]
        assert kernel.width == weights.shape[1]

        ifm_depth = weights.shape[-2]

        # Default HW traversal
        npu_tensor.hw_traversal = NpuBlockTraversal.DEPTH_FIRST

        if npu_block_type == NpuBlockType.ConvolutionMxN:
            # Determine which block traversal strategy has better DPU utilization
            kernel_size = weights.shape[0] * weights.shape[1]
            depth_granule = 32 if ifm_bitdepth == 8 else 16
            kernel_granule = 4 if ifm_bitdepth == 8 else 2
            depth_utilization = weights.shape[2] / round_up(weights.shape[2], depth_granule)
            part_kernel_utilization = (weights.shape[2] / round_up(weights.shape[2], 8)) * (
                kernel_size / round_up(kernel_size, kernel_granule)
            )
            if part_kernel_utilization >= depth_utilization or ifm_depth <= 8:
                # Part-kernel first is always better for ifm depths <= 8
                npu_tensor.hw_traversal = NpuBlockTraversal.PART_KERNEL_FIRST

        if op.type == Op.Conv2DBackpropInputSwitchedBias:
            # Transpose Convolution, reverse weights in H and W axes
            weights = np.flip(weights, axis=(0, 1))

    encoded_stream = bytearray()
    double_buffer_sizes = [0, 0]
    is_depthwise = npu_block_type == NpuBlockType.ConvolutionDepthWise

    # Bias & scale
    if do_scales:
        quantised_scales, biases = _prepare_scale_and_bias(arch, scale_tens, rescale_for_faf, op.explicit_scaling)
        scale_tens.element_size_bytes = 10

    # Slice the weight stream up depth-ways into bricks and compress
    full_ofm_depth = weight_tens.values.shape[-1]
    ofm_block_depth = block_config.ofm_block.depth

    weight_range_index = 0
    for idx, depth_offset in enumerate(depth_offsets[:-1]):
        # Do not generate for offsets outside the OFM
        assert 0 <= depth_offset < full_ofm_depth
        depth_length = depth_offsets[idx + 1] - depth_offset

        # Get the weights necessary for this brick
        if do_weights:
            brick_weights = weights[:, :, :, depth_offset : depth_offset + depth_length]

        buffer_start_offset = len(encoded_stream)

        # For each core, deinterleave weights/scales from the larger volume
        # and generate separate compressed streams.
        for core in range(0, min(arch.ncores, full_ofm_depth)):
            core_block_depth = int((ofm_block_depth + arch.ncores - 1 - core) // arch.ncores)
            if core_block_depth == 0:
                # Nothing of the OFM block falls on this core
                continue

            key = WeightKey(core, depth_offset)
            weight_range = WeightRange()
            weight_range.offset = len(encoded_stream)
            weight_range.index = weight_range_index
            weight_range_index += 1

            # Scales & biases
            if do_scales:
                # Every ncores-th element of the brick, starting at this core
                core_slice = slice(depth_offset + core, depth_offset + core + depth_length, arch.ncores)
                core_scales = quantised_scales[core_slice]
                core_biases = biases[core_slice]
                scale_stream = bytearray()
                for j, core_bias in enumerate(core_biases):
                    scale_stream.extend(encode_bias(np.int64(core_bias), *core_scales[j]))

                weight_range.scale_bytes = len(scale_stream)
                encoded_stream.extend(scale_stream)

                # Align to 16 for start of next substream
                padding = -len(encoded_stream) % 16
                encoded_stream.extend(bytearray(padding))

            # Weights
            if do_weights:
                core_weights = core_deinterleave(brick_weights, core, arch.ncores)
                encoded_substream, _ = encode_weights(
                    accelerator=arch.accelerator_config,
                    weights_volume=core_weights,
                    dilation_xy=kernel.dilation,
                    ifm_bitdepth=ifm_bitdepth,
                    ofm_block_depth=core_block_depth,
                    is_depthwise=is_depthwise,
                    block_traversal=npu_tensor.hw_traversal,
                )
                weight_range.weight_offset = len(encoded_stream) - weight_range.offset
                weight_range.weight_bytes = len(encoded_substream)
                # Append encoded section
                encoded_stream.extend(encoded_substream)
                assert len(encoded_stream) % 16 == 0

            # Record encoded range in tensor
            npu_tensor.encoded_ranges[key] = weight_range

        # Remember maximum encoded length for DoubleBuffering; bricks alternate between the two buffers
        buffer_slot = idx % 2
        brick_bytes = len(encoded_stream) - buffer_start_offset
        double_buffer_sizes[buffer_slot] = max(double_buffer_sizes[buffer_slot], brick_bytes)

    # Attach buffer to tensor
    npu_tensor.buffer = encoded_stream
    npu_tensor.double_buffer_sizes = double_buffer_sizes
    npu_tensor.set_all_shapes([1, 1, 1, len(encoded_stream)])
    npu_tensor.format = TensorFormat.WeightsCompressed

    if do_weights:
        # The new tensor holds the scales as well as the weights; cache it for later reuse
        npu_tensor.purpose = TensorPurpose.Weights
        npu_tensor.mem_area = weight_tens.mem_area
        npu_tensor.mem_type = weight_tens.mem_type
        CompressedWeightCache.add(npu_tensor)
        return npu_tensor, None

    # Scale only tensor
    npu_tensor.weight_compression_config = None
    npu_tensor.purpose = TensorPurpose.FSBias
    npu_tensor.mem_area = scale_tens.mem_area
    npu_tensor.mem_type = scale_tens.mem_type
    return tens_cached, npu_tensor