ethosu/vela/lstm.py - ml/ethos-u/ethos-u-vela - Gitiles

 # SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the License); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an AS IS BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Description:
 # Contains implementation of UnidirectionalSequenceLstm graph optimisation.
 from enum import Enum
 from typing import Tuple

 import numpy as np

 from .data_type import DataType
 from .debug_database import DebugDatabase
 from .graph_optimiser_util import create_avg_pool_for_concat
 from .operation import ActivationFunction
 from .operation import ExplicitScaling
 from .operation import Op
 from .operation import Operation
 from .operation_util import create_add
 from .operation_util import create_fullyconnected
 from .operation_util import create_fused_activation
 from .operation_util import create_mul
 from .scaling import elementwise_mul_scale
 from .shape4d import Shape4D
 from .tensor import QuantizationParameters
 from .tensor import Tensor

 Q0_15_SCALE = np.float32(2**-15)
 """Q0.15 scale like the reference defines it"""


 class Lstm:
     """Lstm graph optimisation.

     Unrolls a UNIDIRECTIONAL_SEQUENCE_LSTM operation into its basic operations.

     Usage:

     unrolled_op = Lstm(op).get_graph()
     """

     class State(Enum):
         """States (variable tensors)"""

         OUTPUT = 18  # Value = tensor index
         CELL = 19  # Value = tensor index

     def __init__(self, op):
         self.op = op

     def get_graph(self) -> Operation:
         """Return the generated graph implementation"""
         self.op.ofm.ops = []
         if self.time_major:
             output_state = self.get_initial_state(Lstm.State.OUTPUT)
             cell_state = self.get_initial_state(Lstm.State.CELL)
             for time in range(self.n_time):
                 feature = self.get_feature(time)
                 output_state, cell_state = self.lstm_step(feature, output_state, cell_state, time)
                 op = self.put_ofm(output_state, time)
         else:
             for batch in range(self.n_batch):
                 output_state = self.get_initial_state(Lstm.State.OUTPUT, batch)
                 cell_state = self.get_initial_state(Lstm.State.CELL, batch)
                 for time in range(self.n_time):
                     feature = self.get_feature(time, batch)
                     output_state, cell_state = self.lstm_step(feature, output_state, cell_state, time, batch)
                     op = self.put_ofm(output_state, time, batch)
         return op

     def get_feature(self, time: int, batch: int = 0) -> Tensor:
         """Get input feature for provided time and batch"""
         feature = self.op.ifm.clone(f"_feature#{batch}.{time}")
         feature.set_all_shapes([self.n_batch if self.time_major else 1, self.n_feature])
         op = Operation(Op.SplitSliceRead, feature.name)
         op.add_input_tensor(self.op.ifm)
         op.set_output_tensor(feature)
         op.set_ifm_ofm_shapes()
         offset = [time, 0, 0] if self.time_major else [batch, time, 0]
         op.read_offsets[0] = Shape4D.from_list(offset, 0)
         op.read_shapes[0] = op.ofm_shapes[0]
         DebugDatabase.add_optimised(self.op, op)
         return feature

     def get_initial_state(self, state_type: State, batch: int = 0) -> Tensor:
         """Get state tensor for provided state type and batch"""
         state = self.state(state_type)
         if self.time_major:
             # For time major just return the 2D state, since all batches
             # are calculated at the same time
             return state
         else:
             # For non time major return one batch of the 2D state
             # by setting the read offset to the provided batch

             # The cloned state tensor will share equivalence id and buffer
             # with the variable state tensor
             n_state = state.shape[-1]
             state_ofm = state.clone(f"_state#{batch}")
             # Set shape to be one batch
             state_ofm.set_all_shapes([1, n_state])
             # Create the op for reading one batch of the state
             # (will be optimised away at a later stage)
             op = Operation(Op.SplitSliceRead, state_ofm.name)
             op.add_input_tensor(state)
             op.set_output_tensor(state_ofm)
             op.set_ifm_ofm_shapes()
             # Set the read offset to the provided batch
             op.read_offsets[0] = Shape4D.from_list([batch, 0], 0)
             # Set the read shape to one batch, see above
             op.read_shapes[0] = op.ofm_shapes[0]
             DebugDatabase.add_optimised(self.op, op)
             return state_ofm

     def get_state(self, op: Operation, batch: int = 0) -> Operation:
         """Setup the correct read offset for reading the state from
         a variable tensor state"""
         if not self.time_major and self.n_batch > 1:
             op.read_offsets[0] = Shape4D.from_list([batch, 0], 0)
             op.read_shapes[0] = Shape4D(op.ifm.shape)
             op.ifm_shapes[0] = Shape4D([self.n_batch, op.ifm.shape[-1]])
         return op

     def put_state(self, op: Operation, state_type: State, batch: int = 0) -> Operation:
         """Save the state for the provided batch by pointing the operations
         ofm to the variable state tensor"""
         # The create op functions always return 4D shape, however the state
         # should have 2D shape for correct operation
         op.ofm.shape = op.ofm.shape[-2:]
         # Get state from type
         state = self.state(state_type)
         # By using the same equivalence_id the backing buffer for the ofm
         # tensor will be the state variable tensor buffer
         op.ofm.equivalence_id = state.equivalence_id
         # Set memory function which will make the tensor be in linear format
         # just as the state variable tensor
         op.memory_function = Op.VariableTensorWrite
         # Set the batch write offset into the state tensor buffer unless
         # time_major mode when all batches are written at once
         if not self.time_major:
             op.write_offset = Shape4D.from_list([batch, 0], 0)
             op.write_shape = Shape4D(op.ofm.shape)
             op.ofm_shapes = [Shape4D(state.shape)]
         DebugDatabase.add_optimised(self.op, op)
         return op

     def put_ofm(self, state: Tensor, time: int, batch: int = 0) -> Operation:
         """Save the output state for the provided batch and time to OFM"""
         name = f"{self.op.ofm.name}#{batch}.{time}"
         offset = Shape4D.from_list([time, 0, 0] if self.time_major else [batch, time, 0], 0)
         op = create_avg_pool_for_concat(self.op, name, state, Shape4D(state.shape), offset)
         # The provided state tensor use the output state tensors buffer, so unless
         # time_major mode we need to set the correct batch read offset
         if not self.time_major:
             op.read_offsets[0] = Shape4D.from_list([batch, 0], 0)
             op.read_shapes[0] = Shape4D(state.shape)
             op.ifm_shapes[0] = Shape4D(self.output_state.shape)
         return op

     def lstm_step(
         self, feature: Tensor, output_state: Tensor, cell_state: Tensor, time: int, batch: int = 0
     ) -> Tuple[Tensor, Tensor]:
         """Generate one step of the LSTM implementation for the provided feature, batch and time"""
         input_gate = self.calculate_gate(
             f"input_gate#{batch}.{time}",
             feature,
             output_state,
             self.input_to_input_weights,
             self.input_bias,
             self.recurrent_to_input_weights,
             None,
             Op.Sigmoid,
             batch,
         )
         forget_gate = self.calculate_gate(
             f"forget_gate#{batch}.{time}",
             feature,
             output_state,
             self.input_to_forget_weights,
             self.forget_bias,
             self.recurrent_to_forget_weights,
             None,
             Op.Sigmoid,
             batch,
         )
         cell_gate = self.calculate_gate(
             f"cell_gate#{batch}.{time}",
             feature,
             output_state,
             self.input_to_cell_weights,
             self.cell_bias,
             self.recurrent_to_cell_weights,
             None,
             Op.Tanh,
             batch,
         )
         cell_state = self.calculate_cell_state(cell_state, input_gate, forget_gate, cell_gate, time, batch)
         output_gate = self.calculate_gate(
             f"output_gate#{batch}.{time}",
             feature,
             output_state,
             self.input_to_output_weights,
             self.output_bias,
             self.recurrent_to_output_weights,
             None,
             Op.Sigmoid,
             batch,
         )
         output_state = self.calculate_output_state(output_gate, cell_state, time, batch)
         return (output_state, cell_state)

     def calculate_gate(
         self,
         name: str,
         input: Tensor,
         state: Tensor,
         input_weights: Tensor,
         input_bias: Tensor,
         recurrent_weights: Tensor,
         recurrent_bias: Tensor,
         activation: Op,
         batch: int = 0,
     ):
         """Generate a gate for the provided input and weights"""
         # Activation( Add( FC(input), FC(output state) ) )
         # Setup fullyconnected quantization
         q_fc = QuantizationParameters()
         q_fc.scale_f32 = np.float32(2**-12)
         q_fc.zero_point = 0
         # Create fullyconnected
         in_fc = create_fullyconnected(f"{name}:{input.name}_fc", input, input_weights, input_bias, q_fc, False)
         re_fc = create_fullyconnected(f"{name}:{state.name}_fc", state, recurrent_weights, recurrent_bias, q_fc, False)
         self.get_state(re_fc, batch)
         # Change fullyconnected ofm data type
         in_fc.ofm.dtype = DataType.int16
         re_fc.ofm.dtype = DataType.int16
         # Setup add quantization
         q_add = q_fc.clone()
         q_add.scale_f32 = Q0_15_SCALE
         # Create add + activation
         add = create_add(f"{name}_add", in_fc.ofm, re_fc.ofm, q_add, ActivationFunction(activation))
         if activation is Op.Sigmoid:
             # For Sigmoid we need to set the activation min/max values to match the possible range
             # in the reference. The values below are the quantized min/max values that the reference
             # can achive for the LUT based Sigmoid/Logistic. (The NPU does however have a larger range
             # due to intermediate higher precision.)
             # The quantized min/max values are divided by the effective output scale 0x3000 (3<<12) used for
             # elementwise operations with fused Tanh/Sigmoid activations (to get correct scaling before the
             # fused activation function). This will yield the dequantized min/max values which are later
             # quantized again by the command stream generator.
             add.activation.max = 32757 / 0x3000
             add.activation.min = 11 / 0x3000
         # Add to debug database
         DebugDatabase.add_optimised(self.op, in_fc)
         DebugDatabase.add_optimised(self.op, re_fc)
         DebugDatabase.add_optimised(self.op, add)
         return add.ofm

     def calculate_cell_state(
         self, cell_state: Tensor, input_gate: Tensor, forget_gate: Tensor, cell_gate: Tensor, time: int, batch: int = 0
     ):
         """Update the cell state from the provided gate output"""
         # Clip( Add( Mul(cell state, forget gate), Mul(input gate, cell gate) ) )
         base_name = f"cell_state#{batch}.{time}"
         # Cell scale
         cell_scale = cell_state.quantization.scale_f32
         # Create mul(cell_state, forget_gate)
         mul_cf = create_mul(f"{base_name}_cf_mul", cell_state, forget_gate, cell_state.quantization)
         self.get_state(mul_cf, batch)
         # Calculate explicit scales to match reference
         multiplier, shift = elementwise_mul_scale(np.double(cell_scale), np.double(Q0_15_SCALE), np.double(cell_scale))
         mul_cf.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
         # Create mul(cell_gate, input_gate)
         mul_ci = create_mul(f"{base_name}_ci_mul", cell_gate, input_gate, cell_state.quantization)
         # Calculate explicit scales to match reference
         multiplier, shift = elementwise_mul_scale(np.double(Q0_15_SCALE), np.double(Q0_15_SCALE), np.double(cell_scale))
         mul_ci.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
         # Setup cell clip
         activation = None if self.cell_clip == 0 else ActivationFunction(Op.Clip)
         if activation:
             activation.max = self.cell_clip
             activation.min = -self.cell_clip
         # Create add + activation
         add = create_add(f"{base_name}_add", mul_cf.ofm, mul_ci.ofm, cell_state.quantization, activation)
         add.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])
         # Save new state
         self.put_state(add, Lstm.State.CELL, batch)
         # Add to debug database
         DebugDatabase.add_optimised(self.op, mul_cf)
         DebugDatabase.add_optimised(self.op, mul_ci)
         DebugDatabase.add_optimised(self.op, add)
         return add.ofm

     def calculate_output_state(self, output_gate: Tensor, cell_state: Tensor, time: int, batch: int):
         """Generate the output state from the provided gate output"""
         # Mul( Tanh(cell state), output gate )
         base_name = f"output_state#{batch}.{time}"
         # Setup tanh quantization
         q_out_tanh = QuantizationParameters()
         q_out_tanh.scale_f32 = Q0_15_SCALE
         q_out_tanh.zero_point = 0
         # Create tanh(cell state)
         tanh = create_fused_activation(Op.Tanh, f"{base_name}_tanh", cell_state, q_out_tanh)
         self.get_state(tanh, batch)
         # Create Mul( Tanh(cell state), output gate )
         q_mul = self.output_state.quantization
         mul = create_mul(f"{base_name}_mul", tanh.ofm, output_gate, q_mul, dtype=self.op.ifm.dtype)
         # Use explicit scaling to match reference, the following line would have been the preferred way
         # mul.forced_output_quantization = self.hidden_quantization
         out_scale = self.hidden_quantization.scale_f32
         multiplier, shift = elementwise_mul_scale(np.double(Q0_15_SCALE), np.double(Q0_15_SCALE), np.double(out_scale))
         mul.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
         # Save new state
         self.put_state(mul, Lstm.State.OUTPUT, batch)
         # Add to debug database
         DebugDatabase.add_optimised(self.op, tanh)
         DebugDatabase.add_optimised(self.op, mul)
         return mul.ofm

     def state(self, state_type: State) -> Tensor:
         """Get state tensor from type"""
         return self.output_state if state_type == Lstm.State.OUTPUT else self.cell_state

     # Dimensions
     @property
     def n_feature(self) -> int:
         return self.op.ifm.shape[-1]

     @property
     def n_time(self) -> int:
         return self.op.ifm.shape[0 if self.time_major else 1]

     @property
     def n_batch(self) -> int:
         return self.op.ifm.shape[1 if self.time_major else 0]

     # Attributes
     @property
     def cell_clip(self) -> int:
         return self.op.attrs.get("cell_clip", 0)

     @property
     def projection_clip(self) -> int:
         return self.op.attrs.get("proj_clip", 0)

     @property
     def time_major(self) -> bool:
         return self.op.attrs.get("time_major", False)

     # Hidden (intermediate)
     @property
     def hidden_quantization(self) -> QuantizationParameters:
         return self.op.intermediates[4].quantization

     # Input weights
     @property
     def input_to_input_weights(self) -> Tensor:
         return self.op.inputs[1]

     @property
     def input_to_forget_weights(self) -> Tensor:
         return self.op.inputs[2]

     @property
     def input_to_cell_weights(self) -> Tensor:
         return self.op.inputs[3]

     @property
     def input_to_output_weights(self) -> Tensor:
         return self.op.inputs[4]

     # Recurrent weights
     @property
     def recurrent_to_input_weights(self) -> Tensor:
         return self.op.inputs[5]

     @property
     def recurrent_to_forget_weights(self) -> Tensor:
         return self.op.inputs[6]

     @property
     def recurrent_to_cell_weights(self) -> Tensor:
         return self.op.inputs[7]

     @property
     def recurrent_to_output_weights(self) -> Tensor:
         return self.op.inputs[8]

     # Peephole weights
     @property
     def cell_to_input_weights(self) -> Tensor:
         return self.op.inputs[9]

     @property
     def cell_to_forget_weights(self) -> Tensor:
         return self.op.inputs[10]

     @property
     def cell_to_output_weights(self) -> Tensor:
         return self.op.inputs[11]

     # Bias tensors
     @property
     def input_bias(self) -> Tensor:
         return self.op.inputs[12]

     @property
     def forget_bias(self) -> Tensor:
         return self.op.inputs[13]

     @property
     def cell_bias(self) -> Tensor:
         return self.op.inputs[14]

     @property
     def output_bias(self) -> Tensor:
         return self.op.inputs[15]

     # Projection tensors
     @property
     def projection_weights(self) -> Tensor:
         return self.op.inputs[16]

     @property
     def projection_bias(self) -> Tensor:
         return self.op.inputs[17]

     # State tensors (variable)
     @property
     def output_state(self) -> Tensor:
         return self.op.inputs[Lstm.State.OUTPUT.value]

     @property
     def cell_state(self) -> Tensor:
         return self.op.inputs[Lstm.State.CELL.value]
	# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
	#
	# SPDX-License-Identifier: Apache-2.0
	#
	# Licensed under the Apache License, Version 2.0 (the License); you may
	# not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	# Description:
	# Contains implementation of UnidirectionalSequenceLstm graph optimisation.
	from enum import Enum
	from typing import Tuple

	import numpy as np

	from .data_type import DataType
	from .debug_database import DebugDatabase
	from .graph_optimiser_util import create_avg_pool_for_concat
	from .operation import ActivationFunction
	from .operation import ExplicitScaling
	from .operation import Op
	from .operation import Operation
	from .operation_util import create_add
	from .operation_util import create_fullyconnected
	from .operation_util import create_fused_activation
	from .operation_util import create_mul
	from .scaling import elementwise_mul_scale
	from .shape4d import Shape4D
	from .tensor import QuantizationParameters
	from .tensor import Tensor

	Q0_15_SCALE = np.float32(2**-15)
	"""Q0.15 scale like the reference defines it"""


	class Lstm:
	"""Lstm graph optimisation.

	Unrolls a UNIDIRECTIONAL_SEQUENCE_LSTM operation into its basic operations.

	Usage:

	unrolled_op = Lstm(op).get_graph()
	"""

	class State(Enum):
	"""States (variable tensors)"""

	OUTPUT = 18 # Value = tensor index
	CELL = 19 # Value = tensor index

	def __init__(self, op):
	self.op = op

	def get_graph(self) -> Operation:
	"""Return the generated graph implementation"""
	self.op.ofm.ops = []
	if self.time_major:
	output_state = self.get_initial_state(Lstm.State.OUTPUT)
	cell_state = self.get_initial_state(Lstm.State.CELL)
	for time in range(self.n_time):
	feature = self.get_feature(time)
	output_state, cell_state = self.lstm_step(feature, output_state, cell_state, time)
	op = self.put_ofm(output_state, time)
	else:
	for batch in range(self.n_batch):
	output_state = self.get_initial_state(Lstm.State.OUTPUT, batch)
	cell_state = self.get_initial_state(Lstm.State.CELL, batch)
	for time in range(self.n_time):
	feature = self.get_feature(time, batch)
	output_state, cell_state = self.lstm_step(feature, output_state, cell_state, time, batch)
	op = self.put_ofm(output_state, time, batch)
	return op

	def get_feature(self, time: int, batch: int = 0) -> Tensor:
	"""Get input feature for provided time and batch"""
	feature = self.op.ifm.clone(f"_feature#{batch}.{time}")
	feature.set_all_shapes([self.n_batch if self.time_major else 1, self.n_feature])
	op = Operation(Op.SplitSliceRead, feature.name)
	op.add_input_tensor(self.op.ifm)
	op.set_output_tensor(feature)
	op.set_ifm_ofm_shapes()
	offset = [time, 0, 0] if self.time_major else [batch, time, 0]
	op.read_offsets[0] = Shape4D.from_list(offset, 0)
	op.read_shapes[0] = op.ofm_shapes[0]
	DebugDatabase.add_optimised(self.op, op)
	return feature

	def get_initial_state(self, state_type: State, batch: int = 0) -> Tensor:
	"""Get state tensor for provided state type and batch"""
	state = self.state(state_type)
	if self.time_major:
	# For time major just return the 2D state, since all batches
	# are calculated at the same time
	return state
	else:
	# For non time major return one batch of the 2D state
	# by setting the read offset to the provided batch

	# The cloned state tensor will share equivalence id and buffer
	# with the variable state tensor
	n_state = state.shape[-1]
	state_ofm = state.clone(f"_state#{batch}")
	# Set shape to be one batch
	state_ofm.set_all_shapes([1, n_state])
	# Create the op for reading one batch of the state
	# (will be optimised away at a later stage)
	op = Operation(Op.SplitSliceRead, state_ofm.name)
	op.add_input_tensor(state)
	op.set_output_tensor(state_ofm)
	op.set_ifm_ofm_shapes()
	# Set the read offset to the provided batch
	op.read_offsets[0] = Shape4D.from_list([batch, 0], 0)
	# Set the read shape to one batch, see above
	op.read_shapes[0] = op.ofm_shapes[0]
	DebugDatabase.add_optimised(self.op, op)
	return state_ofm

	def get_state(self, op: Operation, batch: int = 0) -> Operation:
	"""Setup the correct read offset for reading the state from
	a variable tensor state"""
	if not self.time_major and self.n_batch > 1:
	op.read_offsets[0] = Shape4D.from_list([batch, 0], 0)
	op.read_shapes[0] = Shape4D(op.ifm.shape)
	op.ifm_shapes[0] = Shape4D([self.n_batch, op.ifm.shape[-1]])
	return op

	def put_state(self, op: Operation, state_type: State, batch: int = 0) -> Operation:
	"""Save the state for the provided batch by pointing the operations
	ofm to the variable state tensor"""
	# The create op functions always return 4D shape, however the state
	# should have 2D shape for correct operation
	op.ofm.shape = op.ofm.shape[-2:]
	# Get state from type
	state = self.state(state_type)
	# By using the same equivalence_id the backing buffer for the ofm
	# tensor will be the state variable tensor buffer
	op.ofm.equivalence_id = state.equivalence_id
	# Set memory function which will make the tensor be in linear format
	# just as the state variable tensor
	op.memory_function = Op.VariableTensorWrite
	# Set the batch write offset into the state tensor buffer unless
	# time_major mode when all batches are written at once
	if not self.time_major:
	op.write_offset = Shape4D.from_list([batch, 0], 0)
	op.write_shape = Shape4D(op.ofm.shape)
	op.ofm_shapes = [Shape4D(state.shape)]
	DebugDatabase.add_optimised(self.op, op)
	return op

	def put_ofm(self, state: Tensor, time: int, batch: int = 0) -> Operation:
	"""Save the output state for the provided batch and time to OFM"""
	name = f"{self.op.ofm.name}#{batch}.{time}"
	offset = Shape4D.from_list([time, 0, 0] if self.time_major else [batch, time, 0], 0)
	op = create_avg_pool_for_concat(self.op, name, state, Shape4D(state.shape), offset)
	# The provided state tensor use the output state tensors buffer, so unless
	# time_major mode we need to set the correct batch read offset
	if not self.time_major:
	op.read_offsets[0] = Shape4D.from_list([batch, 0], 0)
	op.read_shapes[0] = Shape4D(state.shape)
	op.ifm_shapes[0] = Shape4D(self.output_state.shape)
	return op

	def lstm_step(
	self, feature: Tensor, output_state: Tensor, cell_state: Tensor, time: int, batch: int = 0
	) -> Tuple[Tensor, Tensor]:
	"""Generate one step of the LSTM implementation for the provided feature, batch and time"""
	input_gate = self.calculate_gate(
	f"input_gate#{batch}.{time}",
	feature,
	output_state,
	self.input_to_input_weights,
	self.input_bias,
	self.recurrent_to_input_weights,
	None,
	Op.Sigmoid,
	batch,
	)
	forget_gate = self.calculate_gate(
	f"forget_gate#{batch}.{time}",
	feature,
	output_state,
	self.input_to_forget_weights,
	self.forget_bias,
	self.recurrent_to_forget_weights,
	None,
	Op.Sigmoid,
	batch,
	)
	cell_gate = self.calculate_gate(
	f"cell_gate#{batch}.{time}",
	feature,
	output_state,
	self.input_to_cell_weights,
	self.cell_bias,
	self.recurrent_to_cell_weights,
	None,
	Op.Tanh,
	batch,
	)
	cell_state = self.calculate_cell_state(cell_state, input_gate, forget_gate, cell_gate, time, batch)
	output_gate = self.calculate_gate(
	f"output_gate#{batch}.{time}",
	feature,
	output_state,
	self.input_to_output_weights,
	self.output_bias,
	self.recurrent_to_output_weights,
	None,
	Op.Sigmoid,
	batch,
	)
	output_state = self.calculate_output_state(output_gate, cell_state, time, batch)
	return (output_state, cell_state)

	def calculate_gate(
	self,
	name: str,
	input: Tensor,
	state: Tensor,
	input_weights: Tensor,
	input_bias: Tensor,
	recurrent_weights: Tensor,
	recurrent_bias: Tensor,
	activation: Op,
	batch: int = 0,
	):
	"""Generate a gate for the provided input and weights"""
	# Activation( Add( FC(input), FC(output state) ) )
	# Setup fullyconnected quantization
	q_fc = QuantizationParameters()
	q_fc.scale_f32 = np.float32(2**-12)
	q_fc.zero_point = 0
	# Create fullyconnected
	in_fc = create_fullyconnected(f"{name}:{input.name}_fc", input, input_weights, input_bias, q_fc, False)
	re_fc = create_fullyconnected(f"{name}:{state.name}_fc", state, recurrent_weights, recurrent_bias, q_fc, False)
	self.get_state(re_fc, batch)
	# Change fullyconnected ofm data type
	in_fc.ofm.dtype = DataType.int16
	re_fc.ofm.dtype = DataType.int16
	# Setup add quantization
	q_add = q_fc.clone()
	q_add.scale_f32 = Q0_15_SCALE
	# Create add + activation
	add = create_add(f"{name}_add", in_fc.ofm, re_fc.ofm, q_add, ActivationFunction(activation))
	if activation is Op.Sigmoid:
	# For Sigmoid we need to set the activation min/max values to match the possible range
	# in the reference. The values below are the quantized min/max values that the reference
	# can achive for the LUT based Sigmoid/Logistic. (The NPU does however have a larger range
	# due to intermediate higher precision.)
	# The quantized min/max values are divided by the effective output scale 0x3000 (3<<12) used for
	# elementwise operations with fused Tanh/Sigmoid activations (to get correct scaling before the
	# fused activation function). This will yield the dequantized min/max values which are later
	# quantized again by the command stream generator.
	add.activation.max = 32757 / 0x3000
	add.activation.min = 11 / 0x3000
	# Add to debug database
	DebugDatabase.add_optimised(self.op, in_fc)
	DebugDatabase.add_optimised(self.op, re_fc)
	DebugDatabase.add_optimised(self.op, add)
	return add.ofm

	def calculate_cell_state(
	self, cell_state: Tensor, input_gate: Tensor, forget_gate: Tensor, cell_gate: Tensor, time: int, batch: int = 0
	):
	"""Update the cell state from the provided gate output"""
	# Clip( Add( Mul(cell state, forget gate), Mul(input gate, cell gate) ) )
	base_name = f"cell_state#{batch}.{time}"
	# Cell scale
	cell_scale = cell_state.quantization.scale_f32
	# Create mul(cell_state, forget_gate)
	mul_cf = create_mul(f"{base_name}_cf_mul", cell_state, forget_gate, cell_state.quantization)
	self.get_state(mul_cf, batch)
	# Calculate explicit scales to match reference
	multiplier, shift = elementwise_mul_scale(np.double(cell_scale), np.double(Q0_15_SCALE), np.double(cell_scale))
	mul_cf.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
	# Create mul(cell_gate, input_gate)
	mul_ci = create_mul(f"{base_name}_ci_mul", cell_gate, input_gate, cell_state.quantization)
	# Calculate explicit scales to match reference
	multiplier, shift = elementwise_mul_scale(np.double(Q0_15_SCALE), np.double(Q0_15_SCALE), np.double(cell_scale))
	mul_ci.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
	# Setup cell clip
	activation = None if self.cell_clip == 0 else ActivationFunction(Op.Clip)
	if activation:
	activation.max = self.cell_clip
	activation.min = -self.cell_clip
	# Create add + activation
	add = create_add(f"{base_name}_add", mul_cf.ofm, mul_ci.ofm, cell_state.quantization, activation)
	add.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])
	# Save new state
	self.put_state(add, Lstm.State.CELL, batch)
	# Add to debug database
	DebugDatabase.add_optimised(self.op, mul_cf)
	DebugDatabase.add_optimised(self.op, mul_ci)
	DebugDatabase.add_optimised(self.op, add)
	return add.ofm

	def calculate_output_state(self, output_gate: Tensor, cell_state: Tensor, time: int, batch: int):
	"""Generate the output state from the provided gate output"""
	# Mul( Tanh(cell state), output gate )
	base_name = f"output_state#{batch}.{time}"
	# Setup tanh quantization
	q_out_tanh = QuantizationParameters()
	q_out_tanh.scale_f32 = Q0_15_SCALE
	q_out_tanh.zero_point = 0
	# Create tanh(cell state)
	tanh = create_fused_activation(Op.Tanh, f"{base_name}_tanh", cell_state, q_out_tanh)
	self.get_state(tanh, batch)
	# Create Mul( Tanh(cell state), output gate )
	q_mul = self.output_state.quantization
	mul = create_mul(f"{base_name}_mul", tanh.ofm, output_gate, q_mul, dtype=self.op.ifm.dtype)
	# Use explicit scaling to match reference, the following line would have been the preferred way
	# mul.forced_output_quantization = self.hidden_quantization
	out_scale = self.hidden_quantization.scale_f32
	multiplier, shift = elementwise_mul_scale(np.double(Q0_15_SCALE), np.double(Q0_15_SCALE), np.double(out_scale))
	mul.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
	# Save new state
	self.put_state(mul, Lstm.State.OUTPUT, batch)
	# Add to debug database
	DebugDatabase.add_optimised(self.op, tanh)
	DebugDatabase.add_optimised(self.op, mul)
	return mul.ofm

	def state(self, state_type: State) -> Tensor:
	"""Get state tensor from type"""
	return self.output_state if state_type == Lstm.State.OUTPUT else self.cell_state

	# Dimensions
	@property
	def n_feature(self) -> int:
	return self.op.ifm.shape[-1]

	@property
	def n_time(self) -> int:
	return self.op.ifm.shape[0 if self.time_major else 1]

	@property
	def n_batch(self) -> int:
	return self.op.ifm.shape[1 if self.time_major else 0]

	# Attributes
	@property
	def cell_clip(self) -> int:
	return self.op.attrs.get("cell_clip", 0)

	@property
	def projection_clip(self) -> int:
	return self.op.attrs.get("proj_clip", 0)

	@property
	def time_major(self) -> bool:
	return self.op.attrs.get("time_major", False)

	# Hidden (intermediate)
	@property
	def hidden_quantization(self) -> QuantizationParameters:
	return self.op.intermediates[4].quantization

	# Input weights
	@property
	def input_to_input_weights(self) -> Tensor:
	return self.op.inputs[1]

	@property
	def input_to_forget_weights(self) -> Tensor:
	return self.op.inputs[2]

	@property
	def input_to_cell_weights(self) -> Tensor:
	return self.op.inputs[3]

	@property
	def input_to_output_weights(self) -> Tensor:
	return self.op.inputs[4]

	# Recurrent weights
	@property
	def recurrent_to_input_weights(self) -> Tensor:
	return self.op.inputs[5]

	@property
	def recurrent_to_forget_weights(self) -> Tensor:
	return self.op.inputs[6]

	@property
	def recurrent_to_cell_weights(self) -> Tensor:
	return self.op.inputs[7]

	@property
	def recurrent_to_output_weights(self) -> Tensor:
	return self.op.inputs[8]

	# Peephole weights
	@property
	def cell_to_input_weights(self) -> Tensor:
	return self.op.inputs[9]

	@property
	def cell_to_forget_weights(self) -> Tensor:
	return self.op.inputs[10]

	@property
	def cell_to_output_weights(self) -> Tensor:
	return self.op.inputs[11]

	# Bias tensors
	@property
	def input_bias(self) -> Tensor:
	return self.op.inputs[12]

	@property
	def forget_bias(self) -> Tensor:
	return self.op.inputs[13]

	@property
	def cell_bias(self) -> Tensor:
	return self.op.inputs[14]

	@property
	def output_bias(self) -> Tensor:
	return self.op.inputs[15]

	# Projection tensors
	@property
	def projection_weights(self) -> Tensor:
	return self.op.inputs[16]

	@property
	def projection_bias(self) -> Tensor:
	return self.op.inputs[17]

	# State tensors (variable)
	@property
	def output_state(self) -> Tensor:
	return self.op.inputs[Lstm.State.OUTPUT.value]

	@property
	def cell_state(self) -> Tensor:
	return self.op.inputs[Lstm.State.CELL.value]