Blame - ethosu/vela/weight_compressor.py - ml/ethos-u/ethos-u-vela

blob: e3e318c32a50c0aa461650cd4c7cf9685a86e843 [file] [log] [blame]

Rickard Bolin	bc6ee58	2022-11-04 08:24:29 +0000	[diff] [blame^]	1	# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Rickard Bolin	bc6ee58	2022-11-04 08:24:29 +0000	[diff] [blame^]	16	#
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	17	# Description:
				18	# Compresses and pads the weights. It also calculates the scales and packs them with the biases.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	19	from collections import namedtuple
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	20	from collections import OrderedDict
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	21	from typing import Dict
				22	from typing import Optional
Louis Verhaard	aeae567	2020-11-02 18:04:27 +0100	[diff] [blame]	23	from typing import Tuple
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	24
				25	import numpy as np
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	26
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	27	from .api import NpuBlockTraversal
Manupa Karunaratne	d83d2e1	2020-07-20 12:05:32 +0100	[diff] [blame]	28	from .architecture_features import Accelerator
				29	from .architecture_features import ArchitectureFeatures
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	30	from .data_type import DataType
Louis Verhaard	7db7896	2020-05-25 15:05:26 +0200	[diff] [blame]	31	from .errors import UnsupportedFeatureError
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	32	from .numeric_util import round_up
				33	from .operation import NpuBlockType
Louis Verhaard	aee5d75	2020-09-30 09:01:52 +0200	[diff] [blame]	34	from .operation import Op
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	35	from .scaling import quantise_scale
				36	from .scaling import reduced_quantise_scale
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	37	from .tensor import Tensor
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	38	from .tensor import TensorFormat
				39	from .tensor import TensorPurpose
Jacob Bohlin	e843d33	2020-06-23 12:12:56 +0200	[diff] [blame]	40	from ethosu import mlw_codec
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	41
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	42
# Meta information describing one weight compression. Two tensors whose weight compression
# configs compare equal will also have identical compressed weights.
WeightCompressionConfig = namedtuple(
    "WeightCompressionConfig",
    "npu_block_type ofm_block_depth ofm_depth_step dilation weight_value_id",
)

# Identifies the scale tensor value and the IFM/OFM scales an encoded scale stream was built from
ScaleCompressionConfig = namedtuple("ScaleCompressionConfig", "scale_value_id ifm_scale ofm_scale")

# Key of an encoded range: the core index and the depth offset the range starts at
WeightKey = namedtuple("WeightKey", "core depth")
				53
				54
				55	class WeightRange:
				56	def __init__(self):
				57	self.offset = 0
				58	self.scale_bytes = 0
				59	self.weight_offset = 0
				60	self.weight_bytes = 0
				61	self.index = 0
				62
				63	@property
				64	def total_bytes(self):
				65	return self.scale_bytes + self.weight_bytes
				66
				67
				68	class NpuWeightTensor(Tensor):
				69	def __init__(self, name):
				70	Tensor.__init__(self, None, None, name + "_npu_encoded_weights")
				71	self.buffer = []
Rickard Bolin	fd8b500	2022-05-16 09:11:06 +0000	[diff] [blame]	72	self.double_buffer_sizes = [0, 0] # Required sizes if double buffering is used
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	73	self.encoded_ranges = OrderedDict()
				74	self.hw_traversal = NpuBlockTraversal.DEPTH_FIRST
				75	self.dtype = DataType.uint8
Tim Hall	d784af7	2021-06-08 21:25:57 +0100	[diff] [blame]	76	self.scale_compression_config = None
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	77
Rickard Bolin	fd8b500	2022-05-16 09:11:06 +0000	[diff] [blame]	78	def max_range_bytes(self):
				79	return max(self.double_buffer_sizes)
				80
				81	def double_buffer_size(self):
				82	"""Return total required size for double buffering"""
				83	return sum(self.double_buffer_sizes)
				84
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	85
				86	class CompressedWeightCache:
				87	"""Global tensor weight compression cache"""
				88
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	89	cache: Dict[WeightCompressionConfig, Tensor] = {}
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	90
				91	@staticmethod
				92	def get_tensor_with_same_compression(wcc):
				93	return CompressedWeightCache.cache.get(wcc)
				94
				95	@staticmethod
				96	def add(tens):
				97	# Adds the compressed weights from the tensor to the cache
				98	wcc = tens.weight_compression_config
				99	CompressedWeightCache.cache[wcc] = tens
				100
				101	@staticmethod
				102	def has_tensor_with_same_compression(wcc):
				103	return wcc in CompressedWeightCache.cache
				104
				105	@staticmethod
				106	def get_unencoded_size_with_same_compression(wcc):
				107	cache_obj = CompressedWeightCache.cache.get(wcc)
				108	return cache_obj[1] if cache_obj else None
				109
				110
def create_weight_compression_config(weight_tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
    """Build the WeightCompressionConfig that identifies how ``weight_tens`` is compressed.

    :param weight_tens: weight tensor; its ``values`` shape and ``value_id`` are read
    :param npu_block_type: block type stored in the config
    :param ofm_block_depth: requested OFM block depth
    :param ofm_depth_step: value stored in the config's ``ofm_depth_step`` field
    :param dilation: value stored in the config's ``dilation`` field
    :return: a WeightCompressionConfig
    """
    # Only the depth of an OFM block matters for weight compression, and a block deeper than
    # the OFM gives the same result as a block exactly as deep as the OFM, so clamp it.
    ofm_depth = weight_tens.values.shape[-1]
    clamped_block_depth = min(ofm_block_depth, ofm_depth)
    return WeightCompressionConfig(
        npu_block_type=npu_block_type,
        ofm_block_depth=clamped_block_depth,
        ofm_depth_step=ofm_depth_step,
        dilation=dilation,
        weight_value_id=weight_tens.value_id,
    )
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	116
Louis Verhaard	3c07c97	2020-05-07 08:12:58 +0200	[diff] [blame]	117
def encode_weights(
    accelerator: Accelerator,
    weights_volume: np.ndarray,
    dilation_xy: Tuple[int, int],
    ifm_bitdepth: int,
    ofm_block_depth: int,
    is_depthwise: bool,
    block_traversal: NpuBlockTraversal,
):
    """
    Internal implementation of the public facing API to use weight encoding.

    :param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator
    :param weights_volume: numpy.ndarray in OHWI layout with a shape of four
    :param dilation_xy: a two element tuple of dilation attributes in x,y dimension
    :param ifm_bitdepth: the bitdepth of input feature map
    :param ofm_block_depth: the depth of blocks for Ethos-U processing
    :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
    :param block_traversal: indicates how these weights are traversed on sub-kernel basis

    :return: a tuple with a bytearray of encoded weights and the size of the unencoded weights
    """
    # Argument types are verified up front, before any value is interpreted
    assert isinstance(accelerator, Accelerator)
    assert isinstance(weights_volume, np.ndarray)
    assert isinstance(dilation_xy, tuple)
    assert isinstance(ifm_bitdepth, int)
    assert isinstance(ofm_block_depth, int)
    assert isinstance(is_depthwise, bool)
    assert isinstance(block_traversal, NpuBlockTraversal)

    # The weight volume must be four dimensional
    assert weights_volume.ndim == 4, "weights ndarray should have a shape of 4"

    # Part-kernel-first traversal cannot be combined with depthwise weights
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    assert not (
        is_depthwise and is_partkernel
    ), "encode_weights :: partkernel and depthwise are mutually exclusive"

    # Only dilation factors of 1 and 2 are accepted
    assert dilation_xy[0] in (1, 2), "encode_weights :: dilation x should be 1 or 2 not {}".format(dilation_xy[0])
    assert dilation_xy[1] in (1, 2), "encode_weights :: dilation y should be 1 or 2 not {}".format(dilation_xy[1])

    accelerator_config = ArchitectureFeatures.accelerator_configs[accelerator]
    ifm_ublock_depth = accelerator_config.ifm_ublock.depth
    ofm_ublock_depth = accelerator_config.ofm_ublock.depth

    # The maximum sub-kernel size is divided by the dilation in each direction
    max_sub_kernel = ArchitectureFeatures.SubKernelMax
    decomp_h = max_sub_kernel.height // dilation_xy[1]
    decomp_w = max_sub_kernel.width // dilation_xy[0]

    return mlw_codec.reorder_encode(
        ifm_ublock_depth,
        ofm_ublock_depth,
        weights_volume,
        ofm_block_depth,
        is_depthwise,
        is_partkernel,
        ifm_bitdepth,
        decomp_h,
        decomp_w,
    )
Manupa Karunaratne	d83d2e1	2020-07-20 12:05:32 +0100	[diff] [blame]	177
				178
def encode_bias(bias: np.int64, scale: int, shift: int):
    """
    Internal implementation of public facing API to pack bias and scale values as required by the Ethos-U

    :param bias: 64bit signed number that includes 40bit signed bias
    :param scale: 32bit scale value
    :param shift: 6bit shift value
    :return: packed 80bit [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
    """
    # Argument types
    assert isinstance(bias, np.int64)
    assert isinstance(scale, int)
    assert isinstance(shift, int)

    # Argument ranges
    assert -(1 << (40 - 1)) <= bias < (1 << (40 - 1))  # signed 40-bit range
    assert 0 <= scale < (1 << 32)  # unsigned 32-bit range
    assert 0 <= shift < (1 << 6)  # unsigned 6-bit range

    # Masking to 40 bits gives the two's complement pattern of a negative bias
    bias_bits = int(bias) & ((1 << 40) - 1)

    # Little-endian layout: 5 bias bytes, then 4 scale bytes, then the shift byte
    packed = bytearray()
    packed += bias_bits.to_bytes(5, "little")
    packed += scale.to_bytes(4, "little")
    packed.append(shift & 0x3F)
    return packed
				209
				210
def core_deinterleave(hwio, core, ncores):
    """Return the output channels of a weight volume that belong to one core.

    :param hwio: four dimensional weight array whose last axis is moved to the front
    :param core: index of the first output channel to take
    :param ncores: stride between the output channels taken
    :return: the selected output channels, with the output-channel axis first
    """
    # Put weights back into OHWI
    ohwi = hwio.transpose(3, 0, 1, 2)
    # Every ncores'th output channel, starting at this core's index
    channel_selection = slice(core, ohwi.shape[0], ncores)
    return ohwi[channel_selection]
				215
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	216
def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling):
    """Compute the quantised (scale, shift) pairs that accompany the biases of ``tens``.

    :param arch: not used by this function
    :param tens: bias tensor; it must be the bias input of its first consumer
    :param rescale_for_faf: when true the scales are computed as ifm_scale * weight_scale * 0x3000
        instead of ifm_scale * weight_scale / ofm_scale
    :param explicit_scaling: if truthy, its ``multiplier`` and ``shift`` sequences are used directly
        instead of quantising the computed scales
    :return: tuple (quantised_scales, biases); quantised_scales is a list of (scale, shift) pairs and
        biases is ``tens.values``
    :raises UnsupportedFeatureError: if the IFM data type is not uint8, int8 or int16
    """
    assert tens.purpose in [TensorPurpose.FeatureMap, TensorPurpose.FSBias]
    assert tens.format == TensorFormat.NHWC
    # the connected operator should expect a bias input unless it is a FullyConnected
    assert tens.consumer_list[0].type.needs_bias()
    # the input bias tensor is the same as that connected to the operator
    bias_tens = tens.consumer_list[0].bias
    assert tens is bias_tens

    # the operator should only have a single output
    assert len(tens.consumer_list[0].outputs) == 1
    biases = tens.values

    first_consumer_op = tens.consumer_list[0]
    ifm_dtype = first_consumer_op.inputs[0].dtype
    ifm_scale = first_consumer_op.get_input_quantization().scale_f32
    ofm_scale = first_consumer_op.get_output_quantization().scale_f32
    weight_scales = first_consumer_op.inputs[1].quantization.scale_f32

    # biases can have multiple consumers for rnn cells. if so, then check that they are all the same
    for op in tens.consumer_list[1:]:
        assert ifm_scale == op.get_input_quantization().scale_f32
        assert ofm_scale == op.get_output_quantization().scale_f32
        # The weight scales may be an array with one entry per channel. A plain == would then give
        # an array whose truth value is ambiguous, so compare the whole contents instead.
        assert np.array_equal(weight_scales, op.inputs[1].quantization.scale_f32)

    if not hasattr(weight_scales, "__iter__"):
        # If weight_scales is not already an iterable make it into a list
        weight_scales = [weight_scales]

    # Convert scales to np.double (from np.float32) to conform to TensorFlow Lite which
    # uses double during scaling calculations
    # TensorFlow Lite casts the scales slightly differently for uint8 and int8
    if not rescale_for_faf:
        if ifm_dtype == DataType.uint8:
            # for some cases of the Mean operator, the scale must be calculated differently to match reference
            if first_consumer_op.low_precision_scaling:
                scales = [
                    np.double(np.single(ifm_scale) / (np.single(weight_scale) * np.single(ofm_scale)))
                    for weight_scale in weight_scales
                ]
            else:
                scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [
                (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale)
                for weight_scale in weight_scales
            ]
        else:
            raise UnsupportedFeatureError(f"Compression of {ifm_dtype} is not implemented; Tensor: '{tens.name}'")
    else:
        if ifm_dtype == DataType.uint8:
            scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales]
        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
            scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales]
        else:
            raise UnsupportedFeatureError(f"Compression of {ifm_dtype} is not implemented; Tensor: '{tens.name}'")

    if explicit_scaling:
        # Use the caller supplied multipliers and shifts as they are, paired as (multiplier, shift)
        assert len(explicit_scaling.shift) == len(explicit_scaling.multiplier)
        quantised_scales = [(int(m), int(s)) for s, m in zip(explicit_scaling.shift, explicit_scaling.multiplier)]
    else:
        # quantise all of the weight scales into (scale_factor, shift)
        if ifm_dtype == DataType.int16 and bias_tens.dtype == DataType.int64:
            # Reference uses reduced scaling for int16 with int64 bias
            quantised_scales = [reduced_quantise_scale(scale) for scale in scales]
        else:
            quantised_scales = [quantise_scale(scale) for scale in scales]

    # Check the output quantisation to see if the scale value needs increasing to the next one
    if first_consumer_op.get_output_quantization().next_after:
        for i, quant_scale in enumerate(quantised_scales):
            q_scale, q_shift = quant_scale
            quantised_scales[i] = (q_scale + 1, q_shift)

    # If only 1 quantised scale is used, repeat that value for the length of the biases
    if len(quantised_scales) == 1:
        quantised_scales = [quantised_scales[0]] * len(biases)

    return quantised_scales, biases
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	296
Jacob Bohlin	e843d33	2020-06-23 12:12:56 +0200	[diff] [blame]	297
def encode_weight_and_scale_tensor(
    arch, op, weight_tens, scale_tens, kernel, block_config, depth_offsets, rescale_for_faf=False
) -> Tuple[Optional[NpuWeightTensor], Optional[NpuWeightTensor]]:
    """Encode the weights and the scales/biases of an operation into NPU tensors.

    The global CompressedWeightCache is consulted first, keyed on the weight compression config:

    * cache hit with an equal scale compression config: the cached tensor is returned as it is
    * cache hit with a different scale compression config: only the scales are encoded, into a new
      scale-only tensor that is returned next to the cached weight tensor
    * cache miss: scales and weights are encoded into one new tensor, which is added to the cache

    :param arch: architecture object; ``ncores`` and ``accelerator_config`` are read
    :param op: operation that consumes the weights
    :param weight_tens: tensor holding the unencoded weights
    :param scale_tens: tensor holding the biases
    :param kernel: kernel object; ``height``, ``width`` and ``dilation`` are read
    :param block_config: block configuration; only ``ofm_block.depth`` is read
    :param depth_offsets: sequence of at least two depth offsets; each adjacent pair delimits one
        depth range to encode
    :param rescale_for_faf: passed on to _prepare_scale_and_bias
    :return: tuple (weights_tensor, scale_tensor); scale_tensor is None unless a scale-only tensor
        was created
    """
    npu_block_type = op.type.npu_block_type

    # These evaluate to scale_tens itself (e.g. None) when scale_tens is falsy
    ifm_scale = scale_tens and scale_tens.consumer_list[0].get_input_quantization().scale_f32
    ofm_scale = scale_tens and scale_tens.consumer_list[0].get_output_quantization().scale_f32

    # The hash of the stringified depth offsets is what gets stored as the config's ofm_depth_step
    wcc = create_weight_compression_config(
        weight_tens, npu_block_type, block_config.ofm_block.depth, hash(str(depth_offsets)), kernel.dilation
    )

    scc = ScaleCompressionConfig(scale_tens and scale_tens.value_id, ifm_scale, ofm_scale)

    tens_cached = CompressedWeightCache.get_tensor_with_same_compression(wcc)
    if tens_cached is not None:
        if tens_cached.scale_compression_config == scc:
            # Weights and scales are both already encoded in the cached tensor
            return tens_cached, None
        # The weights are cached but the scales differ: create a scale-only tensor
        npu_tensor = NpuWeightTensor(scale_tens.name)
        do_weights = False
        do_scales = True
    else:
        npu_tensor = NpuWeightTensor(weight_tens.name)
        do_weights = True
        do_scales = True

    npu_tensor.weight_compression_config = wcc
    npu_tensor.scale_compression_config = scc

    # Ensure depth offsets are terminated at end of OFM shape
    assert len(depth_offsets) > 1, "Require closed depth ranges"

    ifm_bitdepth = op.inputs[0].dtype.size_in_bits()

    # No cache hit, need to perform the encoding
    if do_weights:
        assert weight_tens.quantization is not None
        assert weight_tens.quantization.scale_f32 is not None or op.explicit_scaling
        assert weight_tens.quantization.zero_point is not None

        # Early zero-point correction
        quant_buf = weight_tens.values.astype(np.int16)
        # the zero point can be either a native or numpy type
        if isinstance(weight_tens.quantization.zero_point, (int, float)):
            zero_point = np.int16(weight_tens.quantization.zero_point)
        else:
            zero_point = weight_tens.quantization.zero_point.astype(np.int16)
        weights = quant_buf - zero_point

        # Two dimensional weights get two leading axes of size one, making them four dimensional
        if len(weights.shape) == 2:
            weights = np.expand_dims(np.expand_dims(weights, axis=0), axis=0)

        # Expect this (undilated) equivalence
        assert kernel.height == weights.shape[0]
        assert kernel.width == weights.shape[1]

        ifm_depth = weights.shape[-2]

        # Default HW traversal
        npu_tensor.hw_traversal = NpuBlockTraversal.DEPTH_FIRST

        if npu_block_type == NpuBlockType.ConvolutionMxN:
            # Determine which block traversal strategy has better DPU utilization
            kernel_size = weights.shape[0] * weights.shape[1]
            depth_utilization = weights.shape[2] / round_up(weights.shape[2], 32 if ifm_bitdepth == 8 else 16)
            part_kernel_utilization = (weights.shape[2] / round_up(weights.shape[2], 8)) * (
                kernel_size / round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2)
            )
            if part_kernel_utilization >= depth_utilization or ifm_depth <= 8:
                # Part-kernel first is always better for ifm depths <= 8
                npu_tensor.hw_traversal = NpuBlockTraversal.PART_KERNEL_FIRST

        if op.type == Op.Conv2DBackpropInputSwitchedBias:
            # Transpose Convolution, reverse weights in H and W axes
            weights = np.flip(weights, axis=(0, 1))

    encoded_stream = bytearray()
    double_buffer_sizes = [0, 0]
    is_depthwise = npu_block_type == NpuBlockType.ConvolutionDepthWise

    # Bias & scale
    if do_scales:
        quantised_scales, biases = _prepare_scale_and_bias(arch, scale_tens, rescale_for_faf, op.explicit_scaling)
        # Each element produced by encode_bias occupies 10 bytes
        scale_tens.element_size_bytes = 10

    # Slice the weight stream up depth-ways into bricks and compress
    full_ofm_depth = weight_tens.values.shape[-1]
    ofm_block_depth = block_config.ofm_block.depth

    weight_range_index = 0
    for idx, depth_offset in enumerate(depth_offsets[:-1]):
        # Do not generate for offsets outside the OFM
        assert depth_offset >= 0 and depth_offset < full_ofm_depth
        depth_length = depth_offsets[idx + 1] - depth_offset

        # Get the weights necessary for this brick
        if do_weights:
            brick_weights = weights[:, :, :, depth_offset : depth_offset + depth_length]

        # Start of this depth range in the stream, used for the double buffer size below
        buffer_start_offset = len(encoded_stream)

        # For each core, deinterleave weights/scales from the larger volume
        # and generate separate compressed streams.
        for core in range(0, min(arch.ncores, full_ofm_depth)):

            # Share of the OFM block depth that falls on this core (channels are taken round-robin)
            core_block_depth = int((ofm_block_depth + arch.ncores - 1 - core) // arch.ncores)

            if core_block_depth != 0:
                key = WeightKey(core, depth_offset)
                weight_range = WeightRange()
                weight_range.offset = len(encoded_stream)
                weight_range.index = weight_range_index
                weight_range_index += 1

                # Scales & biases
                if do_scales:
                    scale_stream = []
                    # Every ncores'th scale and bias of this depth range, starting at this core
                    core_scales = quantised_scales[
                        depth_offset + core : depth_offset + core + depth_length : arch.ncores
                    ]
                    core_biases = biases[depth_offset + core : depth_offset + core + depth_length : arch.ncores]
                    for j, core_bias in enumerate(core_biases):
                        scale_stream.extend(encode_bias(np.int64(core_bias), *core_scales[j]))

                    weight_range.scale_bytes = len(scale_stream)

                    encoded_stream.extend(scale_stream)

                    # Align to 16 for start of next substream
                    remainder = len(encoded_stream) % 16
                    if remainder > 0:
                        encoded_stream.extend(bytearray(16 - remainder))

                # Weights
                if do_weights:
                    core_weights = core_deinterleave(brick_weights, core, arch.ncores)
                    encoded_substream, _ = encode_weights(
                        accelerator=arch.accelerator_config,
                        weights_volume=core_weights,
                        dilation_xy=kernel.dilation,
                        ifm_bitdepth=ifm_bitdepth,
                        ofm_block_depth=core_block_depth,
                        is_depthwise=is_depthwise,
                        block_traversal=npu_tensor.hw_traversal,
                    )
                    # The weight offset is relative to the start of this range
                    weight_range.weight_offset = len(encoded_stream) - weight_range.offset
                    weight_range.weight_bytes = len(encoded_substream)
                    # Append encoded section
                    encoded_stream.extend(encoded_substream)
                    assert len(encoded_stream) % 16 == 0

                # Record encoded range in tensor
                npu_tensor.encoded_ranges[key] = weight_range

        # Remember maximum encoded length for DoubleBuffering
        # Depth ranges alternate between the two buffers, so the maximum is tracked per buffer
        double_buffer_sizes[idx % 2] = max(double_buffer_sizes[idx % 2], len(encoded_stream) - buffer_start_offset)

    # Attach buffer to tensor
    npu_tensor.buffer = encoded_stream
    npu_tensor.double_buffer_sizes = double_buffer_sizes
    npu_tensor.set_all_shapes([1, 1, 1, len(encoded_stream)])
    npu_tensor.format = TensorFormat.WeightsCompressed

    # Scale only tensor
    if not do_weights:
        # A scale-only tensor holds no weights, so it carries no weight compression config
        npu_tensor.weight_compression_config = None
        npu_tensor.purpose = TensorPurpose.FSBias
        npu_tensor.mem_area = scale_tens.mem_area
        npu_tensor.mem_type = scale_tens.mem_type
        weights_tensor = tens_cached
        scale_tensor = npu_tensor
    else:
        npu_tensor.purpose = TensorPurpose.Weights
        npu_tensor.mem_area = weight_tens.mem_area
        npu_tensor.mem_type = weight_tens.mem_type
        weights_tensor = npu_tensor
        scale_tensor = None
        # Only tensors that contain encoded weights are added to the cache
        CompressedWeightCache.add(weights_tensor)

    return weights_tensor, scale_tensor