/*
 * Copyright (c) 2020-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ARM_COMPUTE_CLQLSTMLAYER_H
#define ARM_COMPUTE_CLQLSTMLAYER_H

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLCopy.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"

#include "arm_compute/runtime/common/LSTMParams.h"

namespace arm_compute
{
// Forward declarations
class CLCompileContext;
class ICLTensor;
class CLGEMMLowpMatrixAReductionKernel;
class CLQLSTMLayerNormalizationKernel;
class ITensorInfo;
/** Basic function to run @ref CLQLSTMLayer
 *
 * This function calls the following CL functions/kernels:
 *
 * -# @ref CLActivationLayer                                    Activation functions (tanh and logistic)
 * -# @ref CLCopy                                               Copy function for copying output_state_out to output
 * -# @ref CLArithmeticAddition                                 Elementwise addition and subtraction
 * -# @ref CLGEMMLowpMatrixMultiplyCore                         Quantized matrix multiplication core. Accumulators are 32-bit integers
 * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint  Convert 32-bit integers into QSYMM16
 * -# @ref CLGEMMLowpMatrixAReductionKernel                     For precomputing effective biases to use
 * -# @ref CLPixelWiseMultiplication                            Elementwise multiplication
 * -# @ref CLTranspose                                          Transpose function for reshaping the weights
 * */
class CLQLSTMLayer : public IFunction
{
public:
    /** Default constructor */
    CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
    /** Prevent instances of this class from being copied (As this class contains pointers) */
    CLQLSTMLayer(const CLQLSTMLayer &) = delete;
    /** Default move constructor */
    CLQLSTMLayer(CLQLSTMLayer &&) = default;
    /** Prevent instances of this class from being copied (As this class contains pointers) */
    CLQLSTMLayer &operator=(const CLQLSTMLayer &) = delete;
    /** Default move assignment operator */
    CLQLSTMLayer &operator=(CLQLSTMLayer &&) = default;
    /** Default destructor */
    ~CLQLSTMLayer();
    /** Initialize function's tensors.
     *
     * @param[in]  input                       Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED.
     * @param[in]  input_to_forget_weights     2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  input_to_cell_weights       2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  input_to_output_weights     2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  recurrent_to_forget_weights 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  recurrent_to_cell_weights   2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  recurrent_to_output_weights 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  forget_gate_bias            1D weights tensor with dimensions [num_units]. Data type supported: S32.
     * @param[in]  cell_bias                   1D weights tensor with dimensions [num_units]. Data type supported: S32.
     * @param[in]  output_gate_bias            1D weights tensor with dimensions [num_units]. Data type supported: S32.
     * @param[in]  cell_state_in               2D tensor with dimensions [num_units, batch_size]. Data type supported: QSYMM16.
     * @param[in]  output_state_in             2D tensor with dimensions [output_size, batch_size]. Data type supported: Same as @p input.
     * @param[out] cell_state_out              Destination tensor. Output is a 2D tensor with dimensions [num_units, batch_size]. Data type supported: QSYMM16.
     * @param[out] output_state_out            Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data types supported: Same as @p input.
     * @param[out] output                      Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data types supported: Same as @p input.
     * @param[in]  lstm_params                 Weights tensors used in peephole, CIFG and layer normalization optimizations:
     *                                         input_intermediate_scale   Scale of the intermediate result of matmul, i.e. input to layer normalization, at input gate.
     *                                         forget_intermediate_scale  Scale of the intermediate result of matmul, i.e. input to layer normalization, at forget gate.
     *                                         cell_intermediate_scale    Scale of the intermediate result of matmul, i.e. input to layer normalization, at cell gate.
     *                                         output_intermediate_scale  Scale of the intermediate result of matmul, i.e. input to layer normalization, at output gate.
     *                                         hidden_state_zero          The zero point of the hidden state.
     *                                         hidden_state_scale         The scale of the hidden state.
     *                                         input_to_input_weights     (Optional) 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     *                                         recurrent_to_input_weights (Optional) 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     *                                         cell_to_input_weights      (Optional) 1D weights tensor with dimensions [num_units]. Can be nullptr. Data type supported: QSYMM16.
     *                                         cell_to_forget_weights     (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         cell_to_output_weights     (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         input_gate_bias            (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: S32.
     *                                         projection_weights         (Optional) 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     *                                         projection_bias            (Optional) 1D weights tensor with dimensions [output_size]. Data type supported: S32.
     *                                         input_layer_norm_weights   (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         forget_layer_norm_weights  (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         cell_layer_norm_weights    (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         output_layer_norm_weights  (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         cell_threshold             (Optional) The clipping threshold for the cell state, such that values are bound within [-cell_clip, cell_clip].
     *                                                                    If set to 0.0 then clipping is disabled.
     *                                         projection_threshold       (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
     *                                                                    [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
     */
    void configure(const ICLTensor *input,
                   const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                   const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                   const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
                   ICLTensor *cell_state_in, ICLTensor *output_state_in,
                   ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
                   const LSTMParams<ICLTensor> &lstm_params);

    /** Initialize function's tensors.
     *
     * @param[in]  compile_context             The compile context to be used.
     * @param[in]  input                       Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED.
     * @param[in]  input_to_forget_weights     2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  input_to_cell_weights       2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  input_to_output_weights     2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  recurrent_to_forget_weights 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  recurrent_to_cell_weights   2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  recurrent_to_output_weights 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     * @param[in]  forget_gate_bias            1D weights tensor with dimensions [num_units]. Data type supported: S32.
     * @param[in]  cell_bias                   1D weights tensor with dimensions [num_units]. Data type supported: S32.
     * @param[in]  output_gate_bias            1D weights tensor with dimensions [num_units]. Data type supported: S32.
     * @param[in]  cell_state_in               2D tensor with dimensions [num_units, batch_size]. Data type supported: QSYMM16.
     * @param[in]  output_state_in             2D tensor with dimensions [output_size, batch_size]. Data type supported: Same as @p input.
     * @param[out] cell_state_out              Destination tensor. Output is a 2D tensor with dimensions [num_units, batch_size]. Data type supported: QSYMM16.
     * @param[out] output_state_out            Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data types supported: Same as @p input.
     * @param[out] output                      Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data types supported: Same as @p input.
     * @param[in]  lstm_params                 Weights tensors used in peephole, CIFG and layer normalization optimizations:
     *                                         input_intermediate_scale   Scale of the intermediate result of matmul, i.e. input to layer normalization, at input gate.
     *                                         forget_intermediate_scale  Scale of the intermediate result of matmul, i.e. input to layer normalization, at forget gate.
     *                                         cell_intermediate_scale    Scale of the intermediate result of matmul, i.e. input to layer normalization, at cell gate.
     *                                         output_intermediate_scale  Scale of the intermediate result of matmul, i.e. input to layer normalization, at output gate.
     *                                         hidden_state_zero          The zero point of the hidden state.
     *                                         hidden_state_scale         The scale of the hidden state.
     *                                         input_to_input_weights     (Optional) 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     *                                         recurrent_to_input_weights (Optional) 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     *                                         cell_to_input_weights      (Optional) 1D weights tensor with dimensions [num_units]. Can be nullptr. Data type supported: QSYMM16.
     *                                         cell_to_forget_weights     (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         cell_to_output_weights     (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         input_gate_bias            (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: S32.
     *                                         projection_weights         (Optional) 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     *                                         projection_bias            (Optional) 1D weights tensor with dimensions [output_size]. Data type supported: S32.
     *                                         input_layer_norm_weights   (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         forget_layer_norm_weights  (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         cell_layer_norm_weights    (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         output_layer_norm_weights  (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                         cell_threshold             (Optional) The clipping threshold for the cell state, such that values are bound within [-cell_clip, cell_clip].
     *                                                                    If set to 0.0 then clipping is disabled.
     *                                         projection_threshold       (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
     *                                                                    [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
     */
    void configure(const CLCompileContext &compile_context, const ICLTensor *input,
                   const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                   const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                   const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
                   ICLTensor *cell_state_in, ICLTensor *output_state_in,
                   ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
                   const LSTMParams<ICLTensor> &lstm_params);

    /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayer
     *
     * @param[in] input                       Source tensor info. Input is a 2D tensor info with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED.
     * @param[in] input_to_forget_weights     2D weights tensor info with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     * @param[in] input_to_cell_weights       2D weights tensor info with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     * @param[in] input_to_output_weights     2D weights tensor info with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     * @param[in] recurrent_to_forget_weights 2D weights tensor info with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     * @param[in] recurrent_to_cell_weights   2D weights tensor info with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     * @param[in] recurrent_to_output_weights 2D weights tensor info with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     * @param[in] forget_gate_bias            1D weights tensor info with dimensions [num_units]. Data type supported: S32.
     * @param[in] cell_bias                   1D weights tensor info with dimensions [num_units]. Data type supported: S32.
     * @param[in] output_gate_bias            1D weights tensor info with dimensions [num_units]. Data type supported: S32.
     * @param[in] cell_state_in               2D tensor info with dimensions [num_units, batch_size]. Data type supported: QSYMM16.
     * @param[in] output_state_in             2D tensor info with dimensions [output_size, batch_size]. Data type supported: Same as @p input.
     * @param[in] cell_state_out              Destination tensor info. Output is a 2D tensor info with dimensions [num_units, batch_size]. Data type supported: QSYMM16.
     * @param[in] output_state_out            Destination tensor info. Output is a 2D tensor info with dimensions [output_size, batch_size]. Data types supported: Same as @p input.
     * @param[in] output                      Destination tensor info. Output is a 2D tensor info with dimensions [output_size, batch_size]. Data types supported: Same as @p input.
     * @param[in] lstm_params                 Weights tensors info used in peephole, CIFG and layer normalization optimizations:
     *                                        input_intermediate_scale   Scale of the intermediate result of matmul, i.e. input to layer normalization, at input gate.
     *                                        forget_intermediate_scale  Scale of the intermediate result of matmul, i.e. input to layer normalization, at forget gate.
     *                                        cell_intermediate_scale    Scale of the intermediate result of matmul, i.e. input to layer normalization, at cell gate.
     *                                        output_intermediate_scale  Scale of the intermediate result of matmul, i.e. input to layer normalization, at output gate.
     *                                        hidden_state_zero          The zero point of the hidden state.
     *                                        hidden_state_scale         The scale of the hidden state.
     *                                        input_to_input_weights     (Optional) 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
     *                                        recurrent_to_input_weights (Optional) 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     *                                        cell_to_input_weights      (Optional) 1D weights tensor with dimensions [num_units]. Can be nullptr. Data type supported: QSYMM16.
     *                                        cell_to_forget_weights     (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                        cell_to_output_weights     (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                        input_gate_bias            (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: S32.
     *                                        projection_weights         (Optional) 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8.
     *                                        projection_bias            (Optional) 1D weights tensor with dimensions [output_size]. Data type supported: S32.
     *                                        input_layer_norm_weights   (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                        forget_layer_norm_weights  (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                        cell_layer_norm_weights    (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                        output_layer_norm_weights  (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16.
     *                                        cell_threshold             (Optional) The clipping threshold for the cell state, such that values are bound within [-cell_clip, cell_clip].
     *                                                                   If set to 0.0 then clipping is disabled.
     *                                        projection_threshold       (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
     *                                                                   [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
     * @return a status
     */
    static Status validate(const ITensorInfo *input,
                           const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
                           const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
                           const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
                           const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
                           const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
                           const LSTMParams<ITensorInfo> &lstm_params);

    // Inherited methods overridden:
    void run() override;
    void prepare() override;

private:
    /** Identifies which gate a layer-normalization kernel/weight/bias/output belongs to. */
    enum class LayerNormGate : uint8_t
    {
        Forget,
        Cell,
        Input,
        Output,
        Count
    };
    // Number of layer-normalization gates, used to size the per-gate arrays below.
    static constexpr uint8_t _layer_norm_count = static_cast<uint8_t>(LayerNormGate::Count);
    // Index of the dimension holding output_size in the output state tensor shape.
    static constexpr uint32_t _out_state_output_size_dimension_idx = 0;

    /** Internal method to configure matrix multiplication plus output stage of each gate.
     *
     * @param[in] compile_context      The compile context to be used.
     * @param[in] mm                   Matrix multiplication function to use.
     * @param[in] outstage             Output stage function to use.
     * @param[in] gemmlowp_info        GEMMLowp metadata to be used by the output stage.
     * @param[in] mm_input             Input tensor to matrix multiplication function.
     * @param[in] mm_weights           Weights tensor to matrix multiplication function.
     * @param[in] bias                 Bias tensor to matrix multiplication function.
     * @param[in] mm_res               Tensor to be used for storing the result of the matrix multiplication.
     * @param[in] outstage_res         Tensor to be used for storing the result of the output stage.
     * @param[in] gemmlowp_scale       Real multiplier to be used computing multiplier and shift for requantization.
     * @param[in] mm_res_info          Tensor info to be used to initialize matrix multiplication result tensor.
     * @param[in] outstage_tensor_info Tensor info to be used to initialize output stage result tensor.
     *
     */
    void configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
                      const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, CLTensor *mm_res,
                      CLTensor *outstage_res, float gemmlowp_scale,
                      const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info);

    MemoryGroup _memory_group{};

    /** A small internal kernel that performs the copy between two tensors */
    class TensorCopyKernel
    {
        static constexpr uint32_t max_dimension_supported = 2;

        ICLTensor *_src{ nullptr };
        ICLTensor *_dst{ nullptr };
        size_t _row_size{};
        Window _window{};

    public:
        /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayer::TensorCopyKernel
         *
         * @param[in] src Source tensor info.
         * @param[in] dst Destination tensor info.
         *
         * @return a status
         */
        static Status validate(const ITensorInfo &src, const ITensorInfo &dst);
        /** Set the input and output tensors.
         *
         * @param[in]  src Source tensor
         * @param[out] dst Destination tensor
         */
        void configure(ICLTensor &src, ICLTensor &dst);
        /** Run the kernel. */
        void run();
    };

    // Functions used
    CLTranspose _transpose_input_to_forget_weights{};
    CLTranspose _transpose_input_to_cell_weights{};
    CLTranspose _transpose_input_to_output_weights{};
    CLTranspose _transpose_input_to_input_weights{};
    CLTranspose _transpose_recurrent_to_forget_weights{};
    CLTranspose _transpose_recurrent_to_cell_weights{};
    CLTranspose _transpose_recurrent_to_output_weights{};
    CLTranspose _transpose_recurrent_to_input_weights{};
    CLTranspose _transpose_projection_weights{};
    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_input_reduction;
    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction;
    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction;
    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_output_reduction;
    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _projection_reduction;
    CLArithmeticAddition _projection_bias_add{};
    CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{};
    CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{};
    CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{};
    CLGEMMLowpOutputStage _input_to_forget_outstage{};
    CLGEMMLowpOutputStage _recurrent_to_forget_outstage{};
    CLGEMMLowpOutputStage _cell_to_forget_outstage{};
    CLArithmeticAddition _accumulate_input_recurrent_forget{};
    CLArithmeticAddition _accumulate_cell_forget{};
    CLActivationLayer _forget_gate_sigmoid{};
    CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{};
    CLGEMMLowpOutputStage _input_to_cell_outstage{};
    CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{};
    CLGEMMLowpOutputStage _recurrent_to_cell_outstage{};
    CLArithmeticAddition _accumulate_input_recurrent_modulation{};
    CLActivationLayer _cell_gate_tanh{};
    CLArithmeticSubtraction _input_gate_sub{};
    CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{};
    CLGEMMLowpOutputStage _input_to_input_outstage{};
    CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{};
    CLGEMMLowpOutputStage _recurrent_to_input_outstage{};
    CLArithmeticAddition _accumulate_input_recurrent_input{};
    CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{};
    CLGEMMLowpOutputStage _cell_to_input_outstage{};
    CLArithmeticAddition _accumulate_cell_input{};
    CLActivationLayer _input_gate_sigmoid{};
    CLPixelWiseMultiplication _pixelwise_mul_forget_cell{};
    CLPixelWiseMultiplication _pixelwise_mul_input_cell{};
    CLArithmeticAddition _add_forget_cell{};
    CLActivationLayer _cell_clip{};
    CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{};
    CLGEMMLowpOutputStage _input_to_output_outstage{};
    CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{};
    CLGEMMLowpOutputStage _recurrent_to_output_outstage{};
    CLArithmeticAddition _accumulate_input_recurrent_output{};
    CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{};
    CLGEMMLowpOutputStage _cell_to_output_outstage{};
    CLArithmeticAddition _accumulate_cell_to_output{};
    CLActivationLayer _output_gate_sigmoid{};
    CLActivationLayer _hidden_tanh{};
    CLPixelWiseMultiplication _pixelwise_mul_hidden{};
    CLGEMMLowpOutputStage _hidden_outstage{};
    CLGEMMLowpMatrixMultiplyCore _mm_projection{};
    CLGEMMLowpOutputStage _projection_outstage{};
    CLArithmeticAddition _accumulate_projection{};
    CLActivationLayer _projection_clip{};
    std::array<std::unique_ptr<CLQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
    CLCopy _copy_output;

    TensorCopyKernel _projection_bias_copy{};
    TensorCopyKernel _projection_output_to_accumulate_copy{};
    TensorCopyKernel _projection_accumulate_to_output_copy{};
    TensorCopyKernel _hidden_to_output_copy{};

    // Tensor pointers
    const ICLTensor *_input_to_input_weights
    {
        nullptr
    };
    const ICLTensor *_recurrent_to_input_weights{ nullptr };
    const ICLTensor *_projection_bias{ nullptr };
    const ICLTensor *_input_to_forget_weights{ nullptr };
    const ICLTensor *_input_to_cell_weights{ nullptr };
    const ICLTensor *_input_to_output_weights{ nullptr };
    const ICLTensor *_recurrent_to_forget_weights{ nullptr };
    const ICLTensor *_recurrent_to_cell_weights{ nullptr };
    const ICLTensor *_recurrent_to_output_weights{ nullptr };
    const ICLTensor *_projection_weights{ nullptr };
    std::array<const ICLTensor *, _layer_norm_count> _layer_norm_weights{ {} };
    std::array<const ICLTensor *, _layer_norm_count> _layer_norm_bias{ {} };

    // Helpers to index the per-gate layer-normalization arrays by LayerNormGate.
    using LayerNormIndexType = typename std::underlying_type<LayerNormGate>::type;
    inline LayerNormIndexType getGateIndex(LayerNormGate g)
    {
        return static_cast<LayerNormIndexType>(g);
    }

    inline void set_layer_norm_weight(const ICLTensor *t, LayerNormGate g)
    {
        _layer_norm_weights[getGateIndex(g)] = t;
    }

    inline void set_layer_norm_bias(const ICLTensor *t, LayerNormGate g)
    {
        _layer_norm_bias[getGateIndex(g)] = t;
    }

    inline const ICLTensor *get_layer_norm_weight(LayerNormGate g)
    {
        return _layer_norm_weights[getGateIndex(g)];
    }

    inline const ICLTensor *get_layer_norm_bias(LayerNormGate g)
    {
        return _layer_norm_bias[getGateIndex(g)];
    }

    inline CLQLSTMLayerNormalizationKernel &get_layer_norm(LayerNormGate g)
    {
        return *_layer_norms[getGateIndex(g)];
    }

    inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in);
    inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);

    // Temporary tensors
    CLTensor _input_to_forget_weights_transposed{ nullptr };
    CLTensor _input_to_cell_weights_transposed{ nullptr };
    CLTensor _input_to_output_weights_transposed{ nullptr };
    CLTensor _input_to_input_weights_transposed{ nullptr };
    CLTensor _recurrent_to_forget_weights_transposed{ nullptr };
    CLTensor _recurrent_to_cell_weights_transposed{ nullptr };
    CLTensor _recurrent_to_output_weights_transposed{ nullptr };
    CLTensor _recurrent_to_input_weights_transposed{ nullptr };
    CLTensor _projection_weights_transposed{ nullptr };
    CLTensor _input_to_input_eff_bias{ nullptr };
    CLTensor _recurrent_to_input_eff_bias{ nullptr };
    CLTensor _input_to_forget_eff_bias{ nullptr };
    CLTensor _recurrent_to_forget_eff_bias{ nullptr };
    CLTensor _input_to_cell_eff_bias{ nullptr };
    CLTensor _recurrent_to_cell_eff_bias{ nullptr };
    CLTensor _input_to_output_eff_bias{ nullptr };
    CLTensor _recurrent_to_output_eff_bias{ nullptr };
    CLTensor _projection_reduction_res{ nullptr };
    CLTensor _projection_eff_bias{ nullptr };
    CLTensor _mm_input_to_forget_res{ nullptr };
    CLTensor _mm_recurrent_to_forget_res{ nullptr };
    CLTensor _mul_cell_to_forget_res{ nullptr };
    CLTensor _input_to_forget_outstage_res{ nullptr };
    CLTensor _cell_to_forget_outstage_res{ nullptr };
    CLTensor _recurrent_to_forget_outstage_res{ nullptr };
    CLTensor _forget_gate{ nullptr };
    CLTensor _mm_input_to_cell_res{ nullptr };
    CLTensor _input_to_cell_outstage_res{ nullptr };
    CLTensor _mm_recurrent_to_cell_res{ nullptr };
    CLTensor _recurrent_to_cell_outstage_res{ nullptr };
    CLTensor _cell_gate{ nullptr };
    CLTensor _mul_input_cell_res{ nullptr };
    CLTensor _mm_input_to_input_res{ nullptr };
    CLTensor _input_to_input_outstage_res{ nullptr };
    CLTensor _mm_recurrent_to_input_res{ nullptr };
    CLTensor _mul_cell_to_input_res{ nullptr };
    CLTensor _cell_to_input_outstage_res{ nullptr };
    CLTensor _recurrent_to_input_outstage_res{ nullptr };
    CLTensor _input_gate{ nullptr };
    CLTensor _mm_input_to_output_res{ nullptr };
    CLTensor _input_to_output_outstage_res{ nullptr };
    CLTensor _mm_recurrent_to_output_res{ nullptr };
    CLTensor _mul_cell_to_output_res{ nullptr };
    CLTensor _cell_to_output_outstage_res{ nullptr };
    CLTensor _recurrent_to_output_outstage_res{ nullptr };
    CLTensor _output_gate{ nullptr };
    CLTensor _hidden_mul_res{ nullptr };
    CLTensor _hidden_gate{ nullptr };
    CLTensor _mm_projection_res{ nullptr };
    CLTensor _projection_outstage_res{ nullptr };
    CLTensor _projection_out_res{ nullptr };
    CLTensor _projection_accumulate_res{ nullptr };
    CLTensor _ones{ nullptr };
    std::array<CLTensor, _layer_norm_count> _layer_norm_output{ {} };

    inline CLTensor &get_layer_norm_output(LayerNormGate g)
    {
        return _layer_norm_output[getGateIndex(g)];
    }

    bool _is_prepared{ false };
    bool _has_cifg{ false };
    bool _has_cell_clipping{ false };
    bool _has_projection{ false };
    bool _has_projection_clipping{ false };
    bool _has_peephole{ false };
    bool _has_layer_norm{ false };
    bool _projection_tensor_copy_required{ false };
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLQLSTMLAYER_H */