Blame - ethosu/vela/weight_compressor.py - ml/ethos-u/ethos-u-vela

blob: e56cc5e58d5c5173bb6ee1104bfc4d4849308f27 [file] [log] [blame]

Raul Farkas	428a8d5	2023-01-16 16:52:18 +0000	[diff] [blame^]	1	# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Rickard Bolin	bc6ee58	2022-11-04 08:24:29 +0000	[diff] [blame]	16	#
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	17	# Description:
				18	# Compresses and pads the weights. It also calculates the scales and packs them with the biases.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	19	from collections import namedtuple
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	20	from collections import OrderedDict
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	21	from typing import Dict
				22	from typing import Optional
Louis Verhaard	aeae567	2020-11-02 18:04:27 +0100	[diff] [blame]	23	from typing import Tuple
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	24
				25	import numpy as np
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	26
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	27	from .api import NpuBlockTraversal
Manupa Karunaratne	d83d2e1	2020-07-20 12:05:32 +0100	[diff] [blame]	28	from .architecture_features import Accelerator
				29	from .architecture_features import ArchitectureFeatures
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	30	from .data_type import DataType
Louis Verhaard	7db7896	2020-05-25 15:05:26 +0200	[diff] [blame]	31	from .errors import UnsupportedFeatureError
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	32	from .numeric_util import round_up
				33	from .operation import NpuBlockType
Louis Verhaard	aee5d75	2020-09-30 09:01:52 +0200	[diff] [blame]	34	from .operation import Op
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	35	from .scaling import quantise_scale
				36	from .scaling import reduced_quantise_scale
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	37	from .tensor import Tensor
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	38	from .tensor import TensorFormat
				39	from .tensor import TensorPurpose
Raul Farkas	428a8d5	2023-01-16 16:52:18 +0000	[diff] [blame^]	40
				41	# Handle any errors thrown by NumPy while importing mlw_codec module
				42	try:
				43	from ethosu import mlw_codec
				44	except RuntimeError as ex:
				45	if "mlw_codec error: module compiled against API version" in str(ex):
				46	# Extract API versions from error message
				47	matches = [s for s in str(ex).split() if "0x" in s]
				48	if len(matches) == 2:
				49	# Raise new exception with more detailed message
				50	raise ImportError( # pylint: disable=W0707
				51	"NumPy C API version mismatch "
				52	f"(Build-time version: {matches[0]}, "
				53	f"Run-time version: {matches[1]})"
				54	"\nThis is a known issue most likely caused by a change in the API "
				55	"version in NumPy after installing ethos-u-vela.\nYou can find more "
				56	"information about the issue and possible solutions in the "
				57	"'Known Issues' section at https://review.mlplatform.org/"
				58	"plugins/gitiles/ml/ethos-u/ethos-u-vela/+/refs/heads/main/"
				59	"README.md#known-issues"
				60	)
				61	raise
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	62
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	63
# Meta information describing a weight compression. Two tensors that share an identical
# weight compression config also share identical compressed weights.
WeightCompressionConfig = namedtuple(
    "WeightCompressionConfig",
    "npu_block_type ofm_block_depth ofm_depth_step dilation weight_value_id",
)

# Meta information describing the scale/bias part of an encoded tensor
ScaleCompressionConfig = namedtuple("ScaleCompressionConfig", "scale_value_id ifm_scale ofm_scale")

# Key of an encoded range: the core it belongs to and the depth offset it starts at
WeightKey = namedtuple("WeightKey", "core depth")
				74
				75
				76	class WeightRange:
				77	def __init__(self):
				78	self.offset = 0
				79	self.scale_bytes = 0
				80	self.weight_offset = 0
				81	self.weight_bytes = 0
				82	self.index = 0
				83
				84	@property
				85	def total_bytes(self):
				86	return self.scale_bytes + self.weight_bytes
				87
				88
				89	class NpuWeightTensor(Tensor):
				90	def __init__(self, name):
				91	Tensor.__init__(self, None, None, name + "_npu_encoded_weights")
				92	self.buffer = []
Rickard Bolin	fd8b500	2022-05-16 09:11:06 +0000	[diff] [blame]	93	self.double_buffer_sizes = [0, 0] # Required sizes if double buffering is used
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	94	self.encoded_ranges = OrderedDict()
				95	self.hw_traversal = NpuBlockTraversal.DEPTH_FIRST
				96	self.dtype = DataType.uint8
Tim Hall	d784af7	2021-06-08 21:25:57 +0100	[diff] [blame]	97	self.scale_compression_config = None
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	98
Rickard Bolin	fd8b500	2022-05-16 09:11:06 +0000	[diff] [blame]	99	def max_range_bytes(self):
				100	return max(self.double_buffer_sizes)
				101
				102	def double_buffer_size(self):
				103	"""Return total required size for double buffering"""
				104	return sum(self.double_buffer_sizes)
				105
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	106
				107	class CompressedWeightCache:
				108	"""Global tensor weight compression cache"""
				109
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	110	cache: Dict[WeightCompressionConfig, Tensor] = {}
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	111
				112	@staticmethod
				113	def get_tensor_with_same_compression(wcc):
				114	return CompressedWeightCache.cache.get(wcc)
				115
				116	@staticmethod
				117	def add(tens):
				118	# Adds the compressed weights from the tensor to the cache
				119	wcc = tens.weight_compression_config
				120	CompressedWeightCache.cache[wcc] = tens
				121
				122	@staticmethod
				123	def has_tensor_with_same_compression(wcc):
				124	return wcc in CompressedWeightCache.cache
				125
				126	@staticmethod
				127	def get_unencoded_size_with_same_compression(wcc):
				128	cache_obj = CompressedWeightCache.cache.get(wcc)
				129	return cache_obj[1] if cache_obj else None
				130
				131
def create_weight_compression_config(weight_tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
    """
    Build the WeightCompressionConfig that identifies an encoding of `weight_tens`.

    Only the depth of an OFM block matters for weight compression, and a block depth
    larger than the OFM depth gives the same result as a block depth equal to the
    OFM depth, so the block depth is clamped to the last dimension of the weights.
    """
    ofm_depth = weight_tens.values.shape[-1]
    clamped_block_depth = min(ofm_block_depth, ofm_depth)
    return WeightCompressionConfig(
        npu_block_type=npu_block_type,
        ofm_block_depth=clamped_block_depth,
        ofm_depth_step=ofm_depth_step,
        dilation=dilation,
        weight_value_id=weight_tens.value_id,
    )
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	137
Louis Verhaard	3c07c97	2020-05-07 08:12:58 +0200	[diff] [blame]	138
def encode_weights(
    accelerator: Accelerator,
    weights_volume: np.ndarray,
    dilation_xy: Tuple[int, int],
    ifm_bitdepth: int,
    ofm_block_depth: int,
    is_depthwise: bool,
    block_traversal: NpuBlockTraversal,
):
    """
    Internal implementation of the public facing API to use weight encoding.

    :param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator
    :param weights_volume: numpy.ndarray in OHWI layout with a shape of four
    :param dilation_xy: a two element tuple of dilation attributes in x,y dimension
    :param ifm_bitdepth: the bitdepth of input feature map
    :param ofm_block_depth: the depth of blocks for Ethos-U processing
    :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
    :param block_traversal: indicates how these weights are traversed on sub-kernel basis

    :return: a tuple with a bytearray of encoded weights and the size of the unencoded weights
    """
    # Argument type checks
    assert isinstance(accelerator, Accelerator)
    assert isinstance(weights_volume, np.ndarray)
    assert isinstance(dilation_xy, tuple)
    assert isinstance(ifm_bitdepth, int)
    assert isinstance(ofm_block_depth, int)
    assert isinstance(is_depthwise, bool)
    assert isinstance(block_traversal, NpuBlockTraversal)

    # The weights must be a four dimensional volume
    assert len(weights_volume.shape) == 4, "weights ndarray should have a shape of 4"

    # Part-kernel-first traversal cannot be combined with depthwise
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    assert not (is_depthwise and is_partkernel), "encode_weights :: partkernel and depthwise are mutually exclusive"

    # Only dilations of 1 or 2 are accepted
    assert dilation_xy[0] in (1, 2), f"encode_weights :: dilation x should be 1 or 2 not {dilation_xy[0]}"
    assert dilation_xy[1] in (1, 2), f"encode_weights :: dilation y should be 1 or 2 not {dilation_xy[1]}"

    accelerator_config = ArchitectureFeatures.accelerator_configs[accelerator]
    ifm_ublock_depth = accelerator_config.ifm_ublock.depth
    ofm_ublock_depth = accelerator_config.ofm_ublock.depth

    # The maximum sub-kernel size shrinks by the dilation factor in each direction
    max_sub_kernel = ArchitectureFeatures.SubKernelMax
    decomp_h = max_sub_kernel.height // dilation_xy[1]
    decomp_w = max_sub_kernel.width // dilation_xy[0]

    return mlw_codec.reorder_encode(
        ifm_ublock_depth,
        ofm_ublock_depth,
        weights_volume,
        ofm_block_depth,
        is_depthwise,
        is_partkernel,
        ifm_bitdepth,
        decomp_h,
        decomp_w,
    )
Manupa Karunaratne	d83d2e1	2020-07-20 12:05:32 +0100	[diff] [blame]	198
				199
def encode_bias(bias: np.int64, scale: int, shift: int):
    """
    Internal implementation of public facing API to pack bias and scale values as required by the Ethos-U

    :param bias: 64bit signed number that includes 40bit signed bias
    :param scale: 32bit scale value
    :param shift: 6bit shift value
    :return: packed 80bit [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
    """
    # Argument type checks
    assert isinstance(bias, np.int64)
    assert isinstance(scale, int)
    assert isinstance(shift, int)

    # Argument range checks
    assert -(1 << (40 - 1)) <= bias < (1 << (40 - 1))  # signed 40-bit range
    assert 0 <= scale < (1 << 32)  # unsigned 32-bit range
    assert 0 <= shift < (1 << 6)  # unsigned 6-bit range

    packed = bytearray(10)
    # Bytes 0-4: low 40 bits of the bias in two's complement, least significant byte first
    packed[0:5] = (int(bias) & 0xFFFFFFFFFF).to_bytes(5, byteorder="little")
    # Bytes 5-8: the scale, least significant byte first
    packed[5:9] = (scale & 0xFFFFFFFF).to_bytes(4, byteorder="little")
    # Byte 9: the shift in the low 6 bits, top two bits left at zero
    packed[9] = shift & 0x3F
    return packed
				230
				231
def core_deinterleave(hwio, core, ncores):
    """Return, in OHWI layout, every `ncores`-th output channel of `hwio` starting at channel `core`."""
    # Move the last (output) axis to the front: HWIO -> OHWI
    ohwi = np.transpose(hwio, axes=(3, 0, 1, 2))
    num_outputs = ohwi.shape[0]
    return ohwi[slice(core, num_outputs, ncores)]
				236
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	237
def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling):
    """
    Produce the quantised (scale, shift) pairs and the bias values for a bias tensor.

    :param arch: not used by this function
    :param tens: the bias tensor; it must be the bias input of its first consumer operator
    :param rescale_for_faf: when true the scales are computed as ifm_scale * weight_scale * 0x3000
        and the OFM scale is not divided out
    :param explicit_scaling: when truthy, its `multiplier` and `shift` sequences are used directly
        instead of quantising the computed scales
    :return: a tuple (quantised_scales, biases) where quantised_scales is a list of (scale, shift)
        tuples and biases is `tens.values`
    :raises UnsupportedFeatureError: if the IFM data type is not uint8, int8 or int16
    """
    assert tens.purpose in [TensorPurpose.FeatureMap, TensorPurpose.FSBias]
    assert tens.format == TensorFormat.NHWC
    # the connected operator should expect a bias input unless it is a FullyConnected
    assert tens.consumer_list[0].type.needs_bias()
    # the input bias tensor is the same as that connected to the operator
    bias_tens = tens.consumer_list[0].bias
    assert tens is bias_tens

    # the operator should only have a single output
    assert len(tens.consumer_list[0].outputs) == 1
    biases = tens.values

    # All quantisation parameters are taken from the first consumer
    first_consumer_op = tens.consumer_list[0]
    ifm_dtype = first_consumer_op.inputs[0].dtype
    ifm_scale = first_consumer_op.get_input_quantization().scale_f32
    ofm_scale = first_consumer_op.get_output_quantization().scale_f32
    weight_scales = first_consumer_op.inputs[1].quantization.scale_f32

    # biases can have multiple consumers for rnn cells. if so, then check that they are all the same
    for op in tens.consumer_list[1:]:
        assert ifm_scale == op.get_input_quantization().scale_f32
        assert ofm_scale == op.get_output_quantization().scale_f32
        assert weight_scales == op.inputs[1].quantization.scale_f32

    if not hasattr(weight_scales, "__iter__"):
        # If weight_scales is not already an iterable make it into a list
        weight_scales = [weight_scales]

    # Convert scales to np.double (from np.float32) to conform to TensorFlow Lite which
    # uses double during scaling calculations
    # TensorFlow Lite casts the scales slightly differently for uint8 and int8
    # NOTE: the position of each np.single/np.double cast below decides the precision in which
    # the multiplications and divisions are carried out, so the expressions must not be reordered
    if not rescale_for_faf:
        if ifm_dtype == DataType.uint8:
            # for some cases of the Mean operator, the scale must be calculated differently to match reference
            if first_consumer_op.low_precision_scaling:
                scales = [
                    np.double(np.single(ifm_scale) / (np.single(weight_scale) * np.single(ofm_scale)))
                    for weight_scale in weight_scales
                ]
            else:
                scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [
                (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale)
                for weight_scale in weight_scales
            ]
        else:
            raise UnsupportedFeatureError(f"Compression of {ifm_dtype} is not implemented; Tensor: '{tens.name}'")
    else:
        if ifm_dtype == DataType.uint8:
            scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales]
        else:
            raise UnsupportedFeatureError(f"Compression of {ifm_dtype} is not implemented; Tensor: '{tens.name}'")

    if explicit_scaling:
        # Explicit scaling overrides the computed scales; each pair is ordered (multiplier, shift)
        assert len(explicit_scaling.shift) == len(explicit_scaling.multiplier)
        quantised_scales = [(int(m), int(s)) for s, m in zip(explicit_scaling.shift, explicit_scaling.multiplier)]
    else:
        # quantise all of the weight scales into (scale_factor, shift)
        if ifm_dtype == DataType.int16 and bias_tens.dtype == DataType.int64:
            # Reference uses reduced scaling for int16 with int64 bias
            quantised_scales = [reduced_quantise_scale(scale) for scale in scales]
        else:
            quantised_scales = [quantise_scale(scale) for scale in scales]

    # Check the output quantisation to see if the scale value needs increasing to the next one
    if first_consumer_op.get_output_quantization().next_after:
        for i, quant_scale in enumerate(quantised_scales):
            q_scale, q_shift = quant_scale
            quantised_scales[i] = (q_scale + 1, q_shift)

    # If only 1 quantised scale is used, repeat that value for the length of the biases
    if len(quantised_scales) == 1:
        quantised_scales = [quantised_scales[0]] * len(biases)

    return quantised_scales, biases
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	317
Jacob Bohlin	e843d33	2020-06-23 12:12:56 +0200	[diff] [blame]	318
def encode_weight_and_scale_tensor(
    arch, op, weight_tens, scale_tens, kernel, block_config, depth_offsets, rescale_for_faf=False
) -> Tuple[Optional[NpuWeightTensor], Optional[NpuWeightTensor]]:
    """
    Encode the weights and the bias/scale values of an operator into an NPU weight tensor.

    The global CompressedWeightCache is consulted first:
    - weights cached and scale config identical: the cached tensor is returned and nothing is encoded
    - weights cached but scale config different: only the scales are encoded into a new tensor
    - weights not cached: scales and weights are encoded into a new tensor, which is added to the cache

    For every depth range and every core the stream holds the packed biases/scales, zero padding
    up to a multiple of 16 bytes, and then (when weights are encoded) the encoded weights.

    :param arch: architecture description; `ncores` and `accelerator_config` are read from it
    :param op: the operator that consumes the weights
    :param weight_tens: tensor holding the unencoded weights
    :param scale_tens: tensor holding the biases
    :param kernel: kernel of the operator; `height`, `width` and `dilation` are read from it
    :param block_config: block configuration; only `ofm_block.depth` is read from it
    :param depth_offsets: OFM depth offsets delimiting the depth ranges; the last entry closes the
        final range
    :param rescale_for_faf: forwarded to _prepare_scale_and_bias
    :return: a tuple (weights_tensor, scale_tensor); scale_tensor is None unless a scale only
        tensor was created
    """
    npu_block_type = op.type.npu_block_type

    # `x and y` evaluates to x itself when x is falsy, so these stay falsy without a scale tensor
    ifm_scale = scale_tens and scale_tens.consumer_list[0].get_input_quantization().scale_f32
    ofm_scale = scale_tens and scale_tens.consumer_list[0].get_output_quantization().scale_f32

    # The hash of the depth offsets is stored in the ofm_depth_step field of the config
    wcc = create_weight_compression_config(
        weight_tens, npu_block_type, block_config.ofm_block.depth, hash(str(depth_offsets)), kernel.dilation
    )

    scc = ScaleCompressionConfig(scale_tens and scale_tens.value_id, ifm_scale, ofm_scale)

    tens_cached = CompressedWeightCache.get_tensor_with_same_compression(wcc)
    if tens_cached is not None:
        if tens_cached.scale_compression_config == scc:
            # Weights and scales are both already encoded
            return tens_cached, None
        # Weights are already encoded but the scales differ: create a scale only tensor
        npu_tensor = NpuWeightTensor(scale_tens.name)
        do_weights = False
        do_scales = True
    else:
        npu_tensor = NpuWeightTensor(weight_tens.name)
        do_weights = True
        do_scales = True

    npu_tensor.weight_compression_config = wcc
    npu_tensor.scale_compression_config = scc

    # Ensure depth offsets are terminated at end of OFM shape
    assert len(depth_offsets) > 1, "Require closed depth ranges"

    ifm_bitdepth = op.inputs[0].dtype.size_in_bits()

    # No cache hit, need to perform the encoding
    if do_weights:
        assert weight_tens.quantization is not None
        assert weight_tens.quantization.scale_f32 is not None or op.explicit_scaling
        assert weight_tens.quantization.zero_point is not None

        # Early zero-point correction
        quant_buf = weight_tens.values.astype(np.int16)
        # the zero point can be either a native or numpy type
        if isinstance(weight_tens.quantization.zero_point, (int, float)):
            zero_point = np.int16(weight_tens.quantization.zero_point)
        else:
            zero_point = weight_tens.quantization.zero_point.astype(np.int16)
        weights = quant_buf - zero_point

        # Two dimensional weights get two leading axes of size one
        if len(weights.shape) == 2:
            weights = np.expand_dims(np.expand_dims(weights, axis=0), axis=0)

        # Expect this (undilated) equivalence
        assert kernel.height == weights.shape[0]
        assert kernel.width == weights.shape[1]

        ifm_depth = weights.shape[-2]

        # Default HW traversal
        npu_tensor.hw_traversal = NpuBlockTraversal.DEPTH_FIRST

        if npu_block_type == NpuBlockType.ConvolutionMxN:
            # Determine which block traversal strategy has better DPU utilization
            kernel_size = weights.shape[0] * weights.shape[1]
            depth_utilization = weights.shape[2] / round_up(weights.shape[2], 32 if ifm_bitdepth == 8 else 16)
            part_kernel_utilization = (weights.shape[2] / round_up(weights.shape[2], 8)) * (
                kernel_size / round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2)
            )
            if part_kernel_utilization >= depth_utilization or ifm_depth <= 8:
                # Part-kernel first is always better for ifm depths <= 8
                npu_tensor.hw_traversal = NpuBlockTraversal.PART_KERNEL_FIRST

        if op.type == Op.Conv2DBackpropInputSwitchedBias:
            # Transpose Convolution, reverse weights in H and W axes
            weights = np.flip(weights, axis=(0, 1))

    encoded_stream = bytearray()
    double_buffer_sizes = [0, 0]
    is_depthwise = npu_block_type == NpuBlockType.ConvolutionDepthWise

    # Bias & scale
    if do_scales:
        quantised_scales, biases = _prepare_scale_and_bias(arch, scale_tens, rescale_for_faf, op.explicit_scaling)
        # encode_bias packs each bias together with its scale and shift into 10 bytes
        scale_tens.element_size_bytes = 10

    # Slice the weight stream up depth-ways into bricks and compress
    full_ofm_depth = weight_tens.values.shape[-1]
    ofm_block_depth = block_config.ofm_block.depth

    weight_range_index = 0
    for idx, depth_offset in enumerate(depth_offsets[:-1]):
        # Do not generate for offsets outside the OFM
        assert depth_offset >= 0 and depth_offset < full_ofm_depth
        depth_length = depth_offsets[idx + 1] - depth_offset

        # Get the weights necessary for this brick
        if do_weights:
            brick_weights = weights[:, :, :, depth_offset : depth_offset + depth_length]

        buffer_start_offset = len(encoded_stream)

        # For each core, deinterleave weights/scales from the larger volume
        # and generate separate compressed streams.
        for core in range(0, min(arch.ncores, full_ofm_depth)):

            # Number of channels of an OFM block that fall to this core when the channels
            # are dealt out to the cores in turn
            core_block_depth = int((ofm_block_depth + arch.ncores - 1 - core) // arch.ncores)

            if core_block_depth != 0:
                key = WeightKey(core, depth_offset)
                weight_range = WeightRange()
                weight_range.offset = len(encoded_stream)
                weight_range.index = weight_range_index
                weight_range_index += 1

                # Scales & biases
                if do_scales:
                    scale_stream = []
                    # Every ncores-th entry of this depth range, starting at this core's channel
                    core_scales = quantised_scales[
                        depth_offset + core : depth_offset + core + depth_length : arch.ncores
                    ]
                    core_biases = biases[depth_offset + core : depth_offset + core + depth_length : arch.ncores]
                    for j, core_bias in enumerate(core_biases):
                        scale_stream.extend(encode_bias(np.int64(core_bias), *core_scales[j]))

                    weight_range.scale_bytes = len(scale_stream)

                    encoded_stream.extend(scale_stream)

                    # Align to 16 for start of next substream
                    remainder = len(encoded_stream) % 16
                    if remainder > 0:
                        encoded_stream.extend(bytearray(16 - remainder))

                # Weights
                if do_weights:
                    core_weights = core_deinterleave(brick_weights, core, arch.ncores)
                    # The second element of the result is not needed here
                    encoded_substream, _ = encode_weights(
                        accelerator=arch.accelerator_config,
                        weights_volume=core_weights,
                        dilation_xy=kernel.dilation,
                        ifm_bitdepth=ifm_bitdepth,
                        ofm_block_depth=core_block_depth,
                        is_depthwise=is_depthwise,
                        block_traversal=npu_tensor.hw_traversal,
                    )
                    # The weight offset is relative to the start of this range
                    weight_range.weight_offset = len(encoded_stream) - weight_range.offset
                    weight_range.weight_bytes = len(encoded_substream)
                    # Append encoded section
                    encoded_stream.extend(encoded_substream)
                    assert len(encoded_stream) % 16 == 0

                # Record encoded range in tensor
                npu_tensor.encoded_ranges[key] = weight_range

        # Remember maximum encoded length for DoubleBuffering
        # Even and odd depth ranges are tracked separately, one slot for each
        double_buffer_sizes[idx % 2] = max(double_buffer_sizes[idx % 2], len(encoded_stream) - buffer_start_offset)

    # Attach buffer to tensor
    npu_tensor.buffer = encoded_stream
    npu_tensor.double_buffer_sizes = double_buffer_sizes
    npu_tensor.set_all_shapes([1, 1, 1, len(encoded_stream)])
    npu_tensor.format = TensorFormat.WeightsCompressed

    # Scale only tensor
    if not do_weights:
        # The new tensor holds no weights, so it carries no weight compression config
        npu_tensor.weight_compression_config = None
        npu_tensor.purpose = TensorPurpose.FSBias
        npu_tensor.mem_area = scale_tens.mem_area
        npu_tensor.mem_type = scale_tens.mem_type
        weights_tensor = tens_cached
        scale_tensor = npu_tensor
    else:
        npu_tensor.purpose = TensorPurpose.Weights
        npu_tensor.mem_area = weight_tens.mem_area
        npu_tensor.mem_type = weight_tens.mem_type
        weights_tensor = npu_tensor
        scale_tensor = None
        # Only tensors that contain encoded weights are added to the cache
        CompressedWeightCache.add(weights_tensor)

    return weights_tensor, scale_tensor