COMPMID-3439: Fix peephole and projection in CLQLSTMLayer

The following changes are essential to make it work:

- QSYMM16 is added as a supported data type in CLGEMMLowpOutputStage.
- An internal TensorCopyKernel is added, similar to the one in
  NEQLSTMLayer (a sketch follows this list).
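As a rough illustration of what the copy kernel does, here is a sketch of
run() using the members declared in the header diff below (_src, _dst,
_row_size, _window); the body itself is an assumption (map both tensors to
the host, then copy row by row), not necessarily the exact implementation:

    // Sketch only: host-side row-wise copy between two CL tensors.
    // Assumes arm_compute/runtime/CL/CLScheduler.h, arm_compute/core/Helpers.h, <cstring>.
    void CLQLSTMLayer::TensorCopyKernel::run()
    {
        auto &q = CLScheduler::get().queue();

        _src->map(q, true); // blocking map so the host can access the buffers
        _dst->map(q, true);

        Iterator input_iter{ _src, _window };
        Iterator output_iter{ _dst, _window };

        // _window spans at most 2 dimensions; copy one row's worth of bytes per step
        execute_window_loop(_window, [&](const Coordinates &)
        {
            memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
        },
        input_iter, output_iter);

        _src->unmap(q);
        _dst->unmap(q);
    }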

The following are fixes for related issues:

- Projection is modified to remove the copy of projection_bias from
  NEQLSTMLayer.
- Fix a wrong argument passed to validate_mm().
- validate_mm() now returns on error (a sketch follows this list).
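The validate_mm() fix boils down to not discarding its returned Status.
A sketch of the intended call-site pattern (the argument list here is
illustrative, not the actual signature):

    // Before: the Status returned by validate_mm() was ignored, so an
    // invalid matmul/output-stage configuration could not fail validation.
    // After: propagate the error with the library's standard macro.
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, input_weights_transposed,
                                            eff_bias, outstage_scale, mm_res_info, outstage_info));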

Change-Id: Icbd04e9fdb8821eb41dd3e0a6a0980965b779714
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3177
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
index 219f46e..67e8bc7 100644
--- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
@@ -230,7 +230,8 @@
         Output,
         Count
     };
-    static constexpr uint8_t _layer_norm_count = static_cast<uint8_t>(LayerNormGate::Count);
+    static constexpr uint8_t  _layer_norm_count                    = static_cast<uint8_t>(LayerNormGate::Count);
+    static constexpr uint32_t _out_state_output_size_dimension_idx = 0;
 
     /** Internal method to configure matrix multiplication plus output stage of each gate.
      *
@@ -254,6 +255,35 @@
 
     MemoryGroup _memory_group{};
 
+    /** A small internal kernel that copies data between two tensors */
+    class TensorCopyKernel
+    {
+        static constexpr uint32_t max_dimension_supported = 2;
+
+        ICLTensor *_src{ nullptr };
+        ICLTensor *_dst{ nullptr };
+        size_t     _row_size{};
+        Window     _window{};
+
+    public:
+        /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayer::TensorCopyKernel
+         *
+         * @param[in] src Source tensor info.
+         * @param[in] dst Destination tensor info.
+         *
+         * @return a status
+         */
+        static Status validate(const ITensorInfo &src, const ITensorInfo &dst);
+        /** Set the input and output tensors.
+         *
+         * @param[in]  src Source tensor
+         * @param[out] dst Destination tensor
+         */
+        void configure(ICLTensor &src, ICLTensor &dst);
+        /** Run the kernel */
+        void run();
+    };
+
     // Functions used
     CLTranspose                          _transpose_input_to_forget_weights{};
     CLTranspose                          _transpose_input_to_cell_weights{};
@@ -298,7 +328,7 @@
     CLPixelWiseMultiplicationKernel      _pixelwise_mul_cell_to_input{};
     CLGEMMLowpOutputStage                _cell_to_input_outstage{};
     CLSaturatedArithmeticOperationKernel _accumulate_cell_input{};
-    CLActivationLayer                    _input_gate_tanh{};
+    CLActivationLayer                    _input_gate_sigmoid{};
     CLPixelWiseMultiplicationKernel      _pixelwise_mul_forget_cell{};
     CLPixelWiseMultiplicationKernel      _pixelwise_mul_input_cell{};
     CLSaturatedArithmeticOperationKernel _add_forget_cell{};
@@ -309,6 +339,7 @@
     CLGEMMLowpOutputStage                _recurrent_to_output_outstage{};
     CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_output{};
     CLPixelWiseMultiplicationKernel      _pixelwise_mul_cell_to_output{};
+    CLGEMMLowpOutputStage                _cell_to_output_outstage{};
     CLSaturatedArithmeticOperationKernel _accumulate_cell_to_output{};
     CLActivationLayer                    _output_gate_sigmoid{};
     CLActivationLayer                    _hidden_tanh{};
@@ -321,11 +352,13 @@
     std::array<CLQLSTMLayerNormalizationKernel, _layer_norm_count> _layer_norms{ {} };
     CLCopyKernel _copy_output{};
 
+    TensorCopyKernel _projection_bias_copy{};
+    TensorCopyKernel _projection_output_to_accumulate_copy{};
+    TensorCopyKernel _projection_accumulate_to_output_copy{};
+    TensorCopyKernel _hidden_to_output_copy{};
+
     // Tensor pointers
-    const ICLTensor *_input_to_input_weights
-    {
-        nullptr
-    };
+    const ICLTensor *_input_to_input_weights{ nullptr };
     const ICLTensor *_recurrent_to_input_weights{ nullptr };
     const ICLTensor *_projection_bias{ nullptr };
     const ICLTensor *_input_to_forget_weights{ nullptr };
@@ -435,11 +468,15 @@
     CLTensor _input_to_output_outstage_res{ nullptr };
     CLTensor _mm_recurrent_to_output_res{ nullptr };
     CLTensor _mul_cell_to_output_res{ nullptr };
+    CLTensor _cell_to_output_outstage_res{ nullptr };
     CLTensor _recurrent_to_output_outstage_res{ nullptr };
     CLTensor _output_gate{ nullptr };
     CLTensor _hidden_mul_res{ nullptr };
+    CLTensor _hidden_gate{ nullptr };
     CLTensor _mm_projection_res{ nullptr };
     CLTensor _projection_outstage_res{ nullptr };
+    CLTensor _projection_out_res{ nullptr };
+    CLTensor _projection_accumulate_res{ nullptr };
     CLTensor _ones{ nullptr };
     std::array<CLTensor, _layer_norm_count> _layer_norm_output{ {} };
 
@@ -455,6 +492,7 @@
     bool _has_projection_clipping{ false };
     bool _has_peephole{ false };
     bool _has_layer_norm{ false };
+    bool _projection_tensor_copy_required{ false };
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLQLSTMLAYER_H */