erik.andersson@arm.com | 460c689 | 2021-02-24 14:38:09 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 16 | # Description: |
| 17 | # Compresses and pads the weigths. It also calculates the scales and packs with the biases. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 18 | from collections import namedtuple |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 19 | from collections import OrderedDict |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 20 | from typing import Tuple |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 21 | |
| 22 | import numpy as np |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 23 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 24 | from .api import NpuBlockTraversal |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 25 | from .architecture_features import Accelerator |
| 26 | from .architecture_features import ArchitectureFeatures |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 27 | from .data_type import DataType |
Louis Verhaard | 7db7896 | 2020-05-25 15:05:26 +0200 | [diff] [blame] | 28 | from .errors import UnsupportedFeatureError |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 29 | from .numeric_util import round_up |
| 30 | from .operation import NpuBlockType |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 31 | from .operation import Op |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 32 | from .scaling import quantise_scale |
| 33 | from .scaling import reduced_quantise_scale |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 34 | from .tensor import Tensor |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 35 | from .tensor import TensorFormat |
| 36 | from .tensor import TensorPurpose |
Jacob Bohlin | e843d33 | 2020-06-23 12:12:56 +0200 | [diff] [blame] | 37 | from ethosu import mlw_codec |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 38 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 39 | |
Louis Verhaard | 3c07c97 | 2020-05-07 08:12:58 +0200 | [diff] [blame] | 40 | # Contains meta info for a weight compression. If two tensors have identical weight compression config, |
| 41 | # then they also will have identical compressed weights. |
| 42 | WeightCompressionConfig = namedtuple( |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 43 | "WeightCompressionConfig", ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "dilation", "weight_value_id"], |
Louis Verhaard | 3c07c97 | 2020-05-07 08:12:58 +0200 | [diff] [blame] | 44 | ) |
| 45 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 46 | ScaleCompressionConfig = namedtuple("ScaleCompressionConfig", ["scale_value_id", "ifm_scale", "ofm_scale"]) |
| 47 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 48 | WeightKey = namedtuple("WeightKey", ["core", "depth"]) |
| 49 | |
| 50 | |
class WeightRange:
    """Bookkeeping for one encoded substream: where it starts in the encoded
    buffer and how many bytes belong to scales vs weights."""

    def __init__(self):
        self.offset = 0  # start of this range within the encoded buffer
        self.scale_bytes = 0  # size of the packed scale/bias portion
        self.weight_offset = 0  # start of the weights, relative to offset
        self.weight_bytes = 0  # size of the encoded weight portion
        self.index = 0  # sequential index of this range

    @property
    def total_bytes(self):
        """Combined size of the scale/bias and weight portions."""
        return self.weight_bytes + self.scale_bytes
| 62 | |
| 63 | |
class NpuWeightTensor(Tensor):
    """Tensor that holds an NPU-encoded weight (and/or scale) stream."""

    def __init__(self, name):
        # Shape and dtype of the base Tensor are filled in after encoding
        super().__init__(None, None, name + "_npu_encoded_weights")
        self.buffer = []  # the encoded byte stream
        self.encoded_ranges = OrderedDict()  # WeightKey -> WeightRange
        self.max_range_bytes = 0  # largest single-brick encoding, for double buffering
        self.dtype = DataType.uint8
        self.hw_traversal = NpuBlockTraversal.DEPTH_FIRST
        self.scale_compression_config = None
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 73 | |
| 74 | |
class CompressedWeightCache:
    """Global tensor weight compression cache"""

    # Maps WeightCompressionConfig -> encoded tensor
    cache = {}

    @staticmethod
    def get_tensor_with_same_compression(wcc):
        """Return a previously encoded tensor for this compression config, or None."""
        return CompressedWeightCache.cache.get(wcc)

    @staticmethod
    def add(tens):
        """Register the encoded tensor under its weight compression config."""
        CompressedWeightCache.cache[tens.weight_compression_config] = tens

    @staticmethod
    def has_tensor_with_same_compression(wcc):
        """True if an encoding for this compression config has been cached."""
        return wcc in CompressedWeightCache.cache

    @staticmethod
    def get_unencoded_size_with_same_compression(wcc):
        # NOTE(review): this indexes the cached entry with [1], which assumes the cache
        # holds tuples, whereas add() stores tensors directly — confirm callers of this
        # method populate/expect the tuple form.
        entry = CompressedWeightCache.cache.get(wcc)
        return entry[1] if entry else None
| 98 | |
| 99 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 100 | def create_weight_compression_config(weight_tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation): |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 101 | # Note: for an ofm block only its depth is used in weight compression. |
| 102 | # And block depth > ofm depth gives same result as block depth == ofm depth |
James Peet | 7519d50 | 2021-07-19 16:47:58 +0100 | [diff] [blame] | 103 | block_depth = min(ofm_block_depth, weight_tens.values.shape[-1]) |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 104 | return WeightCompressionConfig(npu_block_type, block_depth, ofm_depth_step, dilation, weight_tens.value_id) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 105 | |
Louis Verhaard | 3c07c97 | 2020-05-07 08:12:58 +0200 | [diff] [blame] | 106 | |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 107 | def encode_weights( |
| 108 | accelerator: Accelerator, |
| 109 | weights_volume: np.ndarray, |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 110 | dilation_xy: Tuple[int, int], |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 111 | ifm_bitdepth: int, |
| 112 | ofm_block_depth: int, |
| 113 | is_depthwise: bool, |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 114 | block_traversal: NpuBlockTraversal, |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 115 | ): |
| 116 | """ |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 117 | Internal implementation of the public facing API to use weight encoding. |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 118 | |
Tim Hall | c8a7386 | 2020-10-27 12:43:14 +0000 | [diff] [blame] | 119 | :param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 120 | :param weights_volume: numpy.ndarray in OHWI layout with a shape of four |
| 121 | :param dilation_xy: a two element tuple of dilation attributes in x,y dimension |
| 122 | :param ifm_bitdepth: the bitdepth of input feature map |
Tim Hall | c8a7386 | 2020-10-27 12:43:14 +0000 | [diff] [blame] | 123 | :param ofm_block_depth: the depth of blocks for Ethos-U processing |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 124 | :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 125 | :param block_traversal: indicates how these weights are traversed on sub-kernel basis |
| 126 | |
Fredrik Svedberg | f5c07c4 | 2021-04-23 14:36:42 +0200 | [diff] [blame] | 127 | :return: a tuple with a bytearray of encoded weights and the size of the unencoded weights |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 128 | """ |
Manupa Karunaratne | 8b24f2b | 2020-08-12 18:26:39 +0000 | [diff] [blame] | 129 | # Check arg types |
| 130 | assert isinstance(accelerator, Accelerator) |
| 131 | assert isinstance(weights_volume, np.ndarray) |
| 132 | assert isinstance(dilation_xy, tuple) |
| 133 | assert isinstance(ifm_bitdepth, int) |
| 134 | assert isinstance(ofm_block_depth, int) |
| 135 | assert isinstance(is_depthwise, bool) |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 136 | assert isinstance(block_traversal, NpuBlockTraversal) |
Manupa Karunaratne | 8b24f2b | 2020-08-12 18:26:39 +0000 | [diff] [blame] | 137 | |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 138 | # Checks for weight layout |
| 139 | assert len(weights_volume.shape) == 4, "weights ndarray should have a shape of 4" |
| 140 | |
| 141 | # It cannot be both partkernel and depthwise |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 142 | assert not ( |
| 143 | is_depthwise and block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST |
| 144 | ), "encode_weights :: partkernel and depthwise are mutually exclusive" |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 145 | |
| 146 | # Check valid values for dilation |
| 147 | assert dilation_xy[0] in (1, 2), "encode_weights :: dilation x should be 1 or 2 not {}".format(dilation_xy[0]) |
| 148 | assert dilation_xy[1] in (1, 2), "encode_weights :: dilation y should be 1 or 2 not {}".format(dilation_xy[1]) |
| 149 | |
| 150 | ifm_ublock = ArchitectureFeatures.accelerator_configs[accelerator].ifm_ublock |
| 151 | ofm_ublock = ArchitectureFeatures.accelerator_configs[accelerator].ofm_ublock |
James Peet | c244982 | 2021-07-19 17:09:16 +0100 | [diff] [blame] | 152 | decomp_h = ArchitectureFeatures.SubKernelMax.height // dilation_xy[1] |
| 153 | decomp_w = ArchitectureFeatures.SubKernelMax.width // dilation_xy[0] |
Mauricio Briceno | 67e11f7 | 2021-05-05 12:47:28 +0200 | [diff] [blame] | 154 | |
| 155 | return mlw_codec.reorder_encode( |
| 156 | ifm_ublock.depth, |
| 157 | ofm_ublock.depth, |
| 158 | weights_volume, |
| 159 | ofm_block_depth, |
| 160 | is_depthwise, |
| 161 | block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST, |
| 162 | ifm_bitdepth, |
| 163 | decomp_h, |
| 164 | decomp_w, |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 165 | ) |
Manupa Karunaratne | d83d2e1 | 2020-07-20 12:05:32 +0100 | [diff] [blame] | 166 | |
| 167 | |
Manupa Karunaratne | bef228b | 2020-07-29 18:06:28 +0100 | [diff] [blame] | 168 | def encode_bias(bias: np.int64, scale: int, shift: int): |
| 169 | """ |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 170 | Internal implementation of public facing API to pack bias and scale values as required by the Ethos-U |
Tim Hall | c8a7386 | 2020-10-27 12:43:14 +0000 | [diff] [blame] | 171 | |
Manupa Karunaratne | bef228b | 2020-07-29 18:06:28 +0100 | [diff] [blame] | 172 | :param bias: 64bit signed number that includes 40bit signed bias |
| 173 | :param scale: 32bit scale value |
| 174 | :param shift: 6bit shift value |
| 175 | :return: packed 80bit [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)] |
| 176 | """ |
Manupa Karunaratne | 8b24f2b | 2020-08-12 18:26:39 +0000 | [diff] [blame] | 177 | # Check arg types |
| 178 | assert isinstance(bias, np.int64) |
| 179 | assert isinstance(scale, int) |
| 180 | assert isinstance(shift, int) |
| 181 | |
Manupa Karunaratne | bef228b | 2020-07-29 18:06:28 +0100 | [diff] [blame] | 182 | assert -(1 << (40 - 1)) <= bias < (1 << (40 - 1)) # signed 40-bit range |
| 183 | assert 0 <= scale < (1 << 32) # unsigned 32-bit range |
| 184 | assert 0 <= shift < (1 << 6) # unsigned 6-bit range |
| 185 | |
| 186 | data = bytearray(10) |
| 187 | data[0] = (bias >> (0 * 8)) & 0xFF |
| 188 | data[1] = (bias >> (1 * 8)) & 0xFF |
| 189 | data[2] = (bias >> (2 * 8)) & 0xFF |
| 190 | data[3] = (bias >> (3 * 8)) & 0xFF |
| 191 | data[4] = (bias >> (4 * 8)) & 0xFF |
| 192 | data[5] = (scale >> (0 * 8)) & 0xFF |
| 193 | data[6] = (scale >> (1 * 8)) & 0xFF |
| 194 | data[7] = (scale >> (2 * 8)) & 0xFF |
| 195 | data[8] = (scale >> (3 * 8)) & 0xFF |
| 196 | data[9] = shift & 0x3F |
| 197 | return data |
| 198 | |
| 199 | |
Tim Hall | f7e810a | 2020-06-25 15:04:31 +0100 | [diff] [blame] | 200 | def core_deinterleave(hwio, core, ncores): |
| 201 | # Put weights back into OHWI |
Jacob Bohlin | e843d33 | 2020-06-23 12:12:56 +0200 | [diff] [blame] | 202 | ohwi = np.transpose(hwio, (3, 0, 1, 2)) |
| 203 | return ohwi[core : ohwi.shape[0] : ncores] |
| 204 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 205 | |
Patrik Gustavsson | 8f1f9aa | 2021-06-28 07:41:58 +0200 | [diff] [blame] | 206 | def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling): |
Andreas Nevalainen | 897cc14 | 2020-10-28 15:42:08 +0100 | [diff] [blame] | 207 | assert tens.purpose in [TensorPurpose.FeatureMap, TensorPurpose.FSBias] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 208 | assert tens.format == TensorFormat.NHWC |
| 209 | # the connected operator should expect a bias input unless it is a FullyConnected |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 210 | assert tens.consumer_list[0].type.needs_bias() |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 211 | # the input bias tensor is the same as that connected to the operator |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 212 | bias_tens = tens.consumer_list[0].bias |
Jacob Bohlin | cf7da10 | 2020-05-20 09:03:40 +0200 | [diff] [blame] | 213 | assert tens is bias_tens |
| 214 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 215 | # the operator should only have a single output |
| 216 | assert len(tens.consumer_list[0].outputs) == 1 |
James Peet | 7519d50 | 2021-07-19 16:47:58 +0100 | [diff] [blame] | 217 | biases = tens.values |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 218 | |
| 219 | first_consumer_op = tens.consumer_list[0] |
| 220 | ifm_dtype = first_consumer_op.inputs[0].dtype |
Dwight Lidman | 4f728c0 | 2020-12-17 15:14:45 +0100 | [diff] [blame] | 221 | ifm_scale = first_consumer_op.get_input_quantization().scale_f32 |
Louis Verhaard | 98a3499 | 2020-09-01 10:39:04 +0200 | [diff] [blame] | 222 | ofm_scale = first_consumer_op.get_output_quantization().scale_f32 |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 223 | weight_scales = first_consumer_op.inputs[1].quantization.scale_f32 |
| 224 | |
| 225 | # biases can have multiple consumers for rnn cells. if so, then check that they are all the same |
| 226 | for op in tens.consumer_list[1:]: |
Dwight Lidman | 4f728c0 | 2020-12-17 15:14:45 +0100 | [diff] [blame] | 227 | assert ifm_scale == op.get_input_quantization().scale_f32 |
Louis Verhaard | 98a3499 | 2020-09-01 10:39:04 +0200 | [diff] [blame] | 228 | assert ofm_scale == op.get_output_quantization().scale_f32 |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 229 | assert weight_scales == op.inputs[1].quantization.scale_f32 |
| 230 | |
| 231 | if not hasattr(weight_scales, "__iter__"): |
| 232 | # If weight_scales is not already an iterable make it into a list |
| 233 | weight_scales = [weight_scales] |
| 234 | |
| 235 | # Convert scales to np.double (from np.float32) to conform to TensorFlow Lite which |
| 236 | # uses double during scaling calculations |
| 237 | # TensorFlow Lite casts the scales slightly differently for uint8 and int8 |
| 238 | if not rescale_for_faf: |
| 239 | if ifm_dtype == DataType.uint8: |
Dwight Lidman | 4f728c0 | 2020-12-17 15:14:45 +0100 | [diff] [blame] | 240 | # for some cases of the Mean operator, the scale must be calculated differently to match reference |
| 241 | if first_consumer_op.low_precision_scaling: |
| 242 | scales = [ |
| 243 | np.double(np.single(ifm_scale) / (np.single(weight_scale) * np.single(ofm_scale))) |
| 244 | for weight_scale in weight_scales |
| 245 | ] |
| 246 | else: |
| 247 | scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales] |
Fredrik Svedberg | d67c0aa | 2020-03-30 13:15:28 +0200 | [diff] [blame] | 248 | elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16: |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 249 | scales = [ |
| 250 | (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale) |
| 251 | for weight_scale in weight_scales |
| 252 | ] |
| 253 | else: |
Michael McGeagh | 7a6f843 | 2020-12-02 15:29:22 +0000 | [diff] [blame] | 254 | raise UnsupportedFeatureError(f"Compression of {ifm_dtype} is not implemented; Tensor: '{tens.name}'") |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 255 | else: |
| 256 | if ifm_dtype == DataType.uint8: |
| 257 | scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales] |
Fredrik Svedberg | d67c0aa | 2020-03-30 13:15:28 +0200 | [diff] [blame] | 258 | elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16: |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 259 | scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales] |
| 260 | else: |
Michael McGeagh | 7a6f843 | 2020-12-02 15:29:22 +0000 | [diff] [blame] | 261 | raise UnsupportedFeatureError(f"Compression of {ifm_dtype} is not implemented; Tensor: '{tens.name}'") |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 262 | |
Patrik Gustavsson | 8f1f9aa | 2021-06-28 07:41:58 +0200 | [diff] [blame] | 263 | if explicit_scaling: |
| 264 | assert len(explicit_scaling.shift) == len(explicit_scaling.multiplier) |
| 265 | quantised_scales = [(int(m), int(s)) for s, m in zip(explicit_scaling.shift, explicit_scaling.multiplier)] |
Fredrik Svedberg | d67c0aa | 2020-03-30 13:15:28 +0200 | [diff] [blame] | 266 | else: |
Patrik Gustavsson | 8f1f9aa | 2021-06-28 07:41:58 +0200 | [diff] [blame] | 267 | # quantise all of the weight scales into (scale_factor, shift) |
| 268 | if ifm_dtype == DataType.int16: |
| 269 | quantised_scales = [reduced_quantise_scale(scale) for scale in scales] |
| 270 | else: |
| 271 | quantised_scales = [quantise_scale(scale) for scale in scales] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 272 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 273 | # If only 1 quantised scale is used, repeat that value for the length of the biases |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 274 | if len(quantised_scales) == 1: |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 275 | quantised_scales = [quantised_scales[0]] * len(biases) |
| 276 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 277 | return quantised_scales, biases |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 278 | |
Jacob Bohlin | e843d33 | 2020-06-23 12:12:56 +0200 | [diff] [blame] | 279 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 280 | def encode_weight_and_scale_tensor( |
| 281 | arch, op, weight_tens, scale_tens, kernel, block_config, depth_offsets, rescale_for_faf=False |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 282 | ) -> (NpuWeightTensor, NpuWeightTensor): |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 283 | npu_block_type = op.type.npu_block_type |
| 284 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 285 | ifm_scale = scale_tens and scale_tens.consumer_list[0].get_input_quantization().scale_f32 |
| 286 | ofm_scale = scale_tens and scale_tens.consumer_list[0].get_output_quantization().scale_f32 |
| 287 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 288 | wcc = create_weight_compression_config( |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 289 | weight_tens, npu_block_type, block_config.ofm_block.depth, hash(str(depth_offsets)), kernel.dilation |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 290 | ) |
| 291 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 292 | scc = ScaleCompressionConfig(scale_tens and scale_tens.value_id, ifm_scale, ofm_scale) |
| 293 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 294 | tens_cached = CompressedWeightCache.get_tensor_with_same_compression(wcc) |
| 295 | if tens_cached is not None: |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 296 | if tens_cached.scale_compression_config == scc: |
| 297 | return tens_cached, None |
| 298 | npu_tensor = NpuWeightTensor(scale_tens.name) |
| 299 | do_weights = False |
| 300 | do_scales = True |
| 301 | else: |
| 302 | npu_tensor = NpuWeightTensor(weight_tens.name) |
| 303 | do_weights = True |
| 304 | do_scales = True |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 305 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 306 | npu_tensor.weight_compression_config = wcc |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 307 | npu_tensor.scale_compression_config = scc |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 308 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 309 | # Ensure depth offsets are terminated at end of OFM shape |
| 310 | assert len(depth_offsets) > 1, "Require closed depth ranges" |
| 311 | |
| 312 | ifm_bitdepth = op.inputs[0].dtype.size_in_bits() |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 313 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 314 | # No cache hit, need to perform the encoding |
| 315 | if do_weights: |
| 316 | assert weight_tens.quantization is not None |
Patrik Gustavsson | b081d67 | 2021-08-25 13:49:25 +0200 | [diff] [blame^] | 317 | assert weight_tens.quantization.scale_f32 is not None or op.explicit_scaling |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 318 | assert weight_tens.quantization.zero_point is not None |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 319 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 320 | # Early zero-point correction |
James Peet | 7519d50 | 2021-07-19 16:47:58 +0100 | [diff] [blame] | 321 | quant_buf = weight_tens.values.astype(np.int16) |
Tim Hall | b279844 | 2021-06-24 19:31:38 +0100 | [diff] [blame] | 322 | # the zero point can be either a native or numpy type |
| 323 | if isinstance(weight_tens.quantization.zero_point, (int, float)): |
| 324 | zero_point = np.int16(weight_tens.quantization.zero_point) |
| 325 | else: |
| 326 | zero_point = weight_tens.quantization.zero_point.astype(np.int16) |
| 327 | weights = quant_buf - zero_point |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 328 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 329 | if len(weights.shape) == 2: |
| 330 | weights = np.expand_dims(np.expand_dims(weights, axis=0), axis=0) |
| 331 | |
| 332 | # Expect this (undilated) equivalence |
| 333 | assert kernel.height == weights.shape[0] |
| 334 | assert kernel.width == weights.shape[1] |
| 335 | |
| 336 | ifm_depth = weights.shape[-2] |
| 337 | |
| 338 | # Default HW traversal |
| 339 | npu_tensor.hw_traversal = NpuBlockTraversal.DEPTH_FIRST |
| 340 | |
| 341 | if npu_block_type == NpuBlockType.ConvolutionMxN: |
| 342 | # Determine which block traversal strategy has better DPU utilization |
| 343 | kernel_size = weights.shape[0] * weights.shape[1] |
| 344 | depth_utilization = weights.shape[2] / round_up(weights.shape[2], 32 if ifm_bitdepth == 8 else 16) |
| 345 | part_kernel_utilization = (weights.shape[2] / round_up(weights.shape[2], 8)) * ( |
| 346 | kernel_size / round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2) |
| 347 | ) |
| 348 | if part_kernel_utilization >= depth_utilization or ifm_depth <= 8: |
| 349 | # Part-kernel first is always better for ifm depths <= 8 |
| 350 | npu_tensor.hw_traversal = NpuBlockTraversal.PART_KERNEL_FIRST |
| 351 | |
| 352 | if op.type == Op.Conv2DBackpropInputSwitchedBias: |
| 353 | # Transpose Convoluion, reverse weights in H and W axes |
| 354 | weights = np.flip(weights, axis=(0, 1)) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 355 | |
| 356 | encoded_stream = bytearray() |
| 357 | max_single_buffer_len = 0 |
| 358 | is_depthwise = npu_block_type == NpuBlockType.ConvolutionDepthWise |
| 359 | |
| 360 | # Bias & scale |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 361 | if do_scales: |
Patrik Gustavsson | 8f1f9aa | 2021-06-28 07:41:58 +0200 | [diff] [blame] | 362 | quantised_scales, biases = _prepare_scale_and_bias(arch, scale_tens, rescale_for_faf, op.explicit_scaling) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 363 | scale_tens.element_size_bytes = 10 |
| 364 | |
| 365 | # Slice the weight stream up depth-ways into bricks and compress |
James Peet | 7519d50 | 2021-07-19 16:47:58 +0100 | [diff] [blame] | 366 | full_ofm_depth = weight_tens.values.shape[-1] |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 367 | ofm_block_depth = block_config.ofm_block.depth |
| 368 | |
| 369 | weight_range_index = 0 |
| 370 | for idx, depth_offset in enumerate(depth_offsets[:-1]): |
| 371 | # Do not generate for offsets outside the OFM |
| 372 | assert depth_offset >= 0 and depth_offset < full_ofm_depth |
| 373 | depth_length = depth_offsets[idx + 1] - depth_offset |
| 374 | |
| 375 | # Get the weights necessary for this brick |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 376 | if do_weights: |
| 377 | brick_weights = weights[:, :, :, depth_offset : depth_offset + depth_length] |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 378 | |
| 379 | buffer_start_offset = len(encoded_stream) |
| 380 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 381 | # For each core, deinterleave weights/scales from the larger volume |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 382 | # and generate separate compressed streams. |
| 383 | for core in range(0, min(arch.ncores, full_ofm_depth)): |
| 384 | |
| 385 | core_block_depth = int((ofm_block_depth + arch.ncores - 1 - core) // arch.ncores) |
| 386 | |
| 387 | if core_block_depth != 0: |
| 388 | key = WeightKey(core, depth_offset) |
| 389 | weight_range = WeightRange() |
| 390 | weight_range.offset = len(encoded_stream) |
| 391 | weight_range.index = weight_range_index |
| 392 | weight_range_index += 1 |
| 393 | |
| 394 | # Scales & biases |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 395 | if do_scales: |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 396 | scale_stream = [] |
| 397 | core_scales = quantised_scales[ |
| 398 | depth_offset + core : depth_offset + core + depth_length : arch.ncores |
| 399 | ] |
| 400 | core_biases = biases[depth_offset + core : depth_offset + core + depth_length : arch.ncores] |
| 401 | for j, core_bias in enumerate(core_biases): |
| 402 | scale_stream.extend(encode_bias(np.int64(core_bias), *core_scales[j])) |
| 403 | |
| 404 | weight_range.scale_bytes = len(scale_stream) |
| 405 | |
| 406 | encoded_stream.extend(scale_stream) |
| 407 | |
| 408 | # Align to 16 for start of next substream |
| 409 | remainder = len(encoded_stream) % 16 |
| 410 | if remainder > 0: |
| 411 | encoded_stream.extend(bytearray(16 - remainder)) |
| 412 | |
| 413 | # Weights |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 414 | if do_weights: |
| 415 | core_weights = core_deinterleave(brick_weights, core, arch.ncores) |
| 416 | encoded_substream, _ = encode_weights( |
| 417 | accelerator=arch.accelerator_config, |
| 418 | weights_volume=core_weights, |
| 419 | dilation_xy=kernel.dilation, |
| 420 | ifm_bitdepth=ifm_bitdepth, |
| 421 | ofm_block_depth=core_block_depth, |
| 422 | is_depthwise=is_depthwise, |
| 423 | block_traversal=npu_tensor.hw_traversal, |
| 424 | ) |
| 425 | weight_range.weight_offset = len(encoded_stream) - weight_range.offset |
| 426 | weight_range.weight_bytes = len(encoded_substream) |
| 427 | # Append encoded section |
| 428 | encoded_stream.extend(encoded_substream) |
| 429 | assert len(encoded_stream) % 16 == 0 |
Diqing Zhong | 66d7ec0 | 2021-02-01 19:07:04 +0100 | [diff] [blame] | 430 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 431 | # Record encoded range in tensor |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 432 | npu_tensor.encoded_ranges[key] = weight_range |
| 433 | |
| 434 | # Remember maximum encoded length for DoubleBuffering |
| 435 | max_single_buffer_len = max(max_single_buffer_len, len(encoded_stream) - buffer_start_offset) |
| 436 | |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 437 | # Attach buffer to tensor |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 438 | npu_tensor.buffer = encoded_stream |
| 439 | npu_tensor.max_range_bytes = max_single_buffer_len |
| 440 | npu_tensor.set_all_shapes([1, 1, 1, len(encoded_stream)]) |
| 441 | npu_tensor.format = TensorFormat.WeightsCompressed |
Tim Hall | d784af7 | 2021-06-08 21:25:57 +0100 | [diff] [blame] | 442 | |
| 443 | # Scale only tensor |
| 444 | if not do_weights: |
| 445 | npu_tensor.weight_compression_config = None |
| 446 | npu_tensor.purpose = TensorPurpose.FSBias |
| 447 | npu_tensor.mem_area = scale_tens.mem_area |
| 448 | npu_tensor.mem_type = scale_tens.mem_type |
| 449 | weights_tensor = tens_cached |
| 450 | scale_tensor = npu_tensor |
| 451 | else: |
| 452 | npu_tensor.purpose = TensorPurpose.Weights |
| 453 | npu_tensor.mem_area = weight_tens.mem_area |
| 454 | npu_tensor.mem_type = weight_tens.mem_type |
| 455 | weights_tensor = npu_tensor |
| 456 | scale_tensor = None |
| 457 | CompressedWeightCache.add(weights_tensor) |
| 458 | |
| 459 | return weights_tensor, scale_tensor |