IVGCVSW-7964 Fix UnidirectionalSequenceLstm

 * Fix incorrect batch size and time size for time-major input
 * Fix incorrect time-major check when max time = 1
 * Fix incorrect permutation in Neon validate
 * Fix incorrect scratch buffer shape in Neon validate
 * Add unit tests for time-major input with a single time step

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: I510fae55528be412a58d020e82bd283852e7800b
diff --git a/delegate/classic/src/UnidirectionalSequenceLstm.hpp b/delegate/classic/src/UnidirectionalSequenceLstm.hpp
index 5fa6bb0..3529640 100644
--- a/delegate/classic/src/UnidirectionalSequenceLstm.hpp
+++ b/delegate/classic/src/UnidirectionalSequenceLstm.hpp
@@ -184,7 +184,7 @@
     const armnn::TensorInfo& inputTensorInfo = GetTensorInfoForTfLiteTensor(tfLiteInputTensor);
     const armnn::TensorInfo& outputTensorInfo = GetTensorInfoForTfLiteTensor(tfLiteOutputTensor, true);
 
-    unsigned int batchSize  = inputTensorInfo.GetShape()[0];
+    unsigned int batchSize  = desc.m_TimeMajor ? inputTensorInfo.GetShape()[1] : inputTensorInfo.GetShape()[0];
     unsigned int outputSize = outputTensorInfo.GetShape()[2];
     unsigned int numUnits   = cellStateInInfo.GetShape()[1];
 
diff --git a/delegate/opaque/src/UnidirectionalSequenceLstm.hpp b/delegate/opaque/src/UnidirectionalSequenceLstm.hpp
index 2fd64c0..19a57e8 100644
--- a/delegate/opaque/src/UnidirectionalSequenceLstm.hpp
+++ b/delegate/opaque/src/UnidirectionalSequenceLstm.hpp
@@ -226,7 +226,7 @@
     const armnn::TensorInfo& inputTensorInfo = GetTensorInfoForTfLiteOpaqueTensor(tfLiteInputTensor);
     const armnn::TensorInfo& outputTensorInfo = GetTensorInfoForTfLiteOpaqueTensor(tfLiteOutputTensor, true);
 
-    unsigned int batchSize  = inputTensorInfo.GetShape()[0];
+    unsigned int batchSize  = desc.m_TimeMajor ? inputTensorInfo.GetShape()[1] : inputTensorInfo.GetShape()[0];
     unsigned int outputSize = outputTensorInfo.GetShape()[2];
     unsigned int numUnits   = cellStateInInfo.GetShape()[1];
 
diff --git a/delegate/test/UnidirectionalSequenceLstmTest.cpp b/delegate/test/UnidirectionalSequenceLstmTest.cpp
index 6d896d7..7fd43fc 100644
--- a/delegate/test/UnidirectionalSequenceLstmTest.cpp
+++ b/delegate/test/UnidirectionalSequenceLstmTest.cpp
@@ -157,10 +157,12 @@
                                               isTimeMajor);
 }
 
-void UnidirectionalSequenceLstmTimeMajorTest(std::vector<armnn::BackendId>& backends)
+void UnidirectionalSequenceLstmTimeMajorTestImpl(std::vector<armnn::BackendId>& backends,
+                                                 int32_t timeSize,
+                                                 std::vector<float>& inputValues,
+                                                 std::vector<float>& expectedOutputValues)
 {
     int32_t batchSize = 3;
-    int32_t timeSize = 2;
     int32_t inputSize = 3;
     int32_t outputSize = 4;
     // cellSize and outputSize have the same size when there is no projection.
@@ -243,16 +245,6 @@
     bool hasOutputLayerNormWeights = false;
     std::vector<float> outputLayerNormWeights;
 
-    std::vector<float> inputValues = { 1., 2., 3., 4., 5., 4.,
-                                       3., 2., 1., 2., 3., 4.,
-                                       5., 4., 3., 2., 1., 2. };
-    std::vector<float> expectedOutputValues = { 0.135658f, 0.124673f, 0.021209f, -0.0530204f,
-                                                0.106138f, 0.0404792f, 0.0151644f, -0.00675166f,
-                                                -0.0128514f, 0.0644884f, 0.0709072f, -0.0454045f,
-                                                0.162886f, 0.166494f, 0.0277046f, -0.0369807f,
-                                                0.111716f, 0.043119f, 0.0762981f, -0.0122854f,
-                                                0.104397f, 0.2144f, 0.119192f, -0.0839058f };
-
     tflite::ActivationFunctionType activationFunction = tflite::ActivationFunctionType_TANH;
     float clippingThresCell = 10.f;
     float clippingThresProj = 0.f;
@@ -303,7 +295,45 @@
                                               activationFunction,
                                               clippingThresCell,
                                               clippingThresProj,
-                                              isTimeMajor);
+                                              isTimeMajor);}
+
+void UnidirectionalSequenceLstmTimeMajorTest(std::vector<armnn::BackendId>& backends)
+{
+    int32_t timeSize = 2;
+
+    std::vector<float> inputValues = { 1., 2., 3., 4., 5., 4.,
+                                       3., 2., 1., 2., 3., 4.,
+                                       5., 4., 3., 2., 1., 2. };
+
+    std::vector<float> expectedOutputValues = { 0.135658f, 0.124673f, 0.021209f, -0.0530204f,
+                                                0.106138f, 0.0404792f, 0.0151644f, -0.00675166f,
+                                                -0.0128514f, 0.0644884f, 0.0709072f, -0.0454045f,
+                                                0.162886f, 0.166494f, 0.0277046f, -0.0369807f,
+                                                0.111716f, 0.043119f, 0.0762981f, -0.0122854f,
+                                                0.104397f, 0.2144f, 0.119192f, -0.0839058f };
+
+    UnidirectionalSequenceLstmTimeMajorTestImpl(backends,
+                                                timeSize,
+                                                inputValues,
+                                                expectedOutputValues);
+}
+
+void UnidirectionalSequenceLstmTimeMajorSingleTimeTest(std::vector<armnn::BackendId>& backends)
+{
+    int32_t timeSize = 1;
+
+    std::vector<float> inputValues = { 1., 2., 3.,
+                                       4., 5., 6.,
+                                       7., 8., 9. };
+
+    std::vector<float> expectedOutputValues = { 0.13565768f, 0.12467254f, 0.02120903f, -0.05302038f,
+                                                0.1053334f, 0.08508634f, 0.00667238f, -0.00356043f,
+                                                0.05638668f, 0.02924093f, 0.00119751f, -0.00017249f };
+
+    UnidirectionalSequenceLstmTimeMajorTestImpl(backends,
+                                                timeSize,
+                                                inputValues,
+                                                expectedOutputValues);
 }
 
 void UnidirectionalSequenceLstmNoCifgWithPeepholeWithProjectionTest(std::vector<armnn::BackendId>& backends)
@@ -1411,6 +1441,12 @@
     UnidirectionalSequenceLstmTimeMajorTest(backends);
 }
 
+TEST_CASE ("UnidirectionalSequenceLstmTimeMajorSingleTimeTest_CpuRef_Test")
+{
+    std::vector <armnn::BackendId> backends = {armnn::Compute::CpuRef};
+    UnidirectionalSequenceLstmTimeMajorSingleTimeTest(backends);
+}
+
 TEST_CASE ("UnidirectionalSequenceLstmNoCifgWithPeepholeWithProjectionTest_CpuRef_Test")
 {
     std::vector <armnn::BackendId> backends = {armnn::Compute::CpuRef};
diff --git a/src/armnnTfLiteParser/TfLiteParser.cpp b/src/armnnTfLiteParser/TfLiteParser.cpp
index 301989e..052aac6 100644
--- a/src/armnnTfLiteParser/TfLiteParser.cpp
+++ b/src/armnnTfLiteParser/TfLiteParser.cpp
@@ -4049,7 +4049,7 @@
         desc.m_HiddenStateScale = hiddentensor->GetInfo().GetQuantizationScale();
         desc.m_HiddenStateZeroPoint = hiddentensor->GetInfo().GetQuantizationOffset();
     }
-    unsigned int batchSize  = inputTensorInfo.GetShape()[0];
+    unsigned int batchSize  = desc.m_TimeMajor ? inputTensorInfo.GetShape()[1] : inputTensorInfo.GetShape()[0];
     unsigned int outputSize = outputTensorInfo.GetShape()[2];
     unsigned int numUnits   = cellStateInInfo.GetShape()[1];
 
diff --git a/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.cpp
index 4a63d39..5381df5 100644
--- a/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.cpp
+++ b/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.cpp
@@ -224,7 +224,8 @@
     const armnn::TensorShape& outputExpectedShape,
     float qScale = 1.0f,
     int32_t qOffset = 0,
-    armnn::DataType constantDataType = armnn::DataType::Float32) {
+    armnn::DataType constantDataType = armnn::DataType::Float32)
+{
     IgnoreUnused(memoryManager);
     unsigned int batchSize = armnn::numeric_cast<unsigned int>(inputShape[0]);
     unsigned int timeSize = armnn::numeric_cast<unsigned int>(inputShape[1]);
@@ -413,7 +414,8 @@
     const armnn::TensorShape& outputExpectedShape,
     float qScale = 1.0f,
     int32_t qOffset = 0,
-    armnn::DataType constantDataType = armnn::DataType::Float32) {
+    armnn::DataType constantDataType = armnn::DataType::Float32)
+{
     IgnoreUnused(memoryManager);
     unsigned int batchSize = armnn::numeric_cast<unsigned int>(inputShape[1]);
     unsigned int timeSize = armnn::numeric_cast<unsigned int>(inputShape[0]);
@@ -613,7 +615,8 @@
 LayerTestResult<float, 3> UnidirectionalSequenceLstmLayerFloat32BatchMajorSingleBatchTest(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
-    const armnn::ITensorHandleFactory& tensorHandleFactory) {
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
     armnn::TensorInfo inputInfo({3, 1, 3}, armnn::DataType::Float32);
     std::vector<float> input = { 1., 2., 3., 4., 5., 4., 3., 2., 1. };
 
@@ -626,10 +629,32 @@
         input, expectedOutput, inputInfo.GetShape(), outputInfo.GetShape());
 }
 
+LayerTestResult<float, 3> UnidirectionalSequenceLstmLayerFloat32TimeMajorSingleTimeTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    armnn::TensorInfo inputInfo({ 1, 3, 3 }, armnn::DataType::Float32);
+    std::vector<float> input = { 1., 2., 3.,
+                                 4., 5., 6.,
+                                 7., 8., 9. };
+
+    armnn::TensorInfo outputInfo({ 1, 3, 4 }, armnn::DataType::Float32);
+    std::vector<float> expectedOutput =
+                          { 0.13565768f, 0.12467254f, 0.02120903f, -0.05302038f,
+                            0.1053334f, 0.08508634f, 0.00667238f, -0.00356043f,
+                            0.05638668f, 0.02924093f, 0.00119751f, -0.00017249f      };
+
+    return UnidirectionalSequenceLstmLayerFloat32TimeMajorTestImpl<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, tensorHandleFactory,
+        input, expectedOutput, inputInfo.GetShape(), outputInfo.GetShape());
+}
+
 LayerTestResult<float, 3> UnidirectionalSequenceLstmLayerFloat32Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
-    const armnn::ITensorHandleFactory& tensorHandleFactory) {
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
     armnn::TensorInfo inputInfo({3, 2, 3}, armnn::DataType::Float32);
     std::vector<float> input = { 1., 2., 3., 4., 5., 4.,
                                  3., 2., 1., 2., 3., 4.,
@@ -650,7 +675,8 @@
 LayerTestResult<float, 3> UnidirectionalSequenceLstmLayerFloat32TimeMajorTest(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
-    const armnn::ITensorHandleFactory& tensorHandleFactory) {
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
     armnn::TensorInfo inputInfo({2, 3, 3}, armnn::DataType::Float32);
     std::vector<float> input = { 1., 2., 3., 4., 5., 4.,
                                  3., 2., 1., 2., 3., 4.,
diff --git a/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.hpp
index f303b28..5e64dcd 100644
--- a/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.hpp
+++ b/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2021, 2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -20,6 +20,11 @@
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
     const armnn::ITensorHandleFactory& tensorHandleFactory);
 
+LayerTestResult<float, 3> UnidirectionalSequenceLstmLayerFloat32TimeMajorSingleTimeTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
 LayerTestResult<float, 3> UnidirectionalSequenceLstmLayerFloat32Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
diff --git a/src/backends/cl/test/ClLayerTests.cpp b/src/backends/cl/test/ClLayerTests.cpp
index d8d451e..33e1b69 100644
--- a/src/backends/cl/test/ClLayerTests.cpp
+++ b/src/backends/cl/test/ClLayerTests.cpp
@@ -1346,6 +1346,8 @@
                               UnidirectionalSequenceLstmLayerFloat32TimeMajorSingleBatchTest)
 ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32BatchMajorSingleBatch,
                               UnidirectionalSequenceLstmLayerFloat32BatchMajorSingleBatchTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32TimeMajorSingleTime,
+                              UnidirectionalSequenceLstmLayerFloat32TimeMajorSingleTimeTest)
 ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32,
                               UnidirectionalSequenceLstmLayerFloat32Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32TimeMajor,
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index ae8352d..588c90b 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -1092,6 +1092,8 @@
                               UnidirectionalSequenceLstmLayerFloat32TimeMajorSingleBatchTest)
 ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32BatchMajorSingleBatch,
                               UnidirectionalSequenceLstmLayerFloat32BatchMajorSingleBatchTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32TimeMajorSingleTime,
+                              UnidirectionalSequenceLstmLayerFloat32TimeMajorSingleTimeTest)
 ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32,
                               UnidirectionalSequenceLstmLayerFloat32Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32TimeMajor,
diff --git a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.cpp b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.cpp
index e48425e..bbdcd1f 100644
--- a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.cpp
@@ -603,7 +603,8 @@
             statusSplit = arm_compute::NESplit::validate(&aclPermuteOutInfo,
                                                          splitterOutputsTensorInfosPtr,
                                                          aclAxisSplit);
-        } else
+        }
+        else
         {
             statusSplit = arm_compute::NESplit::validate(&aclInputInfo, splitterOutputsTensorInfosPtr, aclAxisSplit);
         }
@@ -740,7 +741,7 @@
         // Set input of LSTM to be first input ITensor.
         // Set output of LSTM to be final output ITensor.
         // LSTM input/output cannot be > 2 dimensions so need to resize its TensorInfo.
-        if (maxTime == 1 && !descriptor.m_TimeMajor)
+        if (maxTime == 1 && descriptor.m_TimeMajor)
         {
             TensorShape inputShape = GetTensorShape(aclInputInfo.tensor_shape(), 1U);
             TensorShape outputShape = GetTensorShape(aclOutputInfo.tensor_shape(), 1U);
diff --git a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp
index 8a1747e..984a5dc 100644
--- a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp
+++ b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp
@@ -500,6 +500,12 @@
     TensorShape inputLayerShape = input.GetShape();
     TensorShape outputLayerShape = output.GetShape();
 
+    if (inputLayerShape.GetNumDimensions() != 3)
+    {
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
+                                   "Unidirectional Sequence LSTM layer validate status failed.");
+    }
+
     unsigned int maxTime = descriptor.m_TimeMajor ? inputLayerShape[0] : inputLayerShape[1];
     unsigned int batchSize = descriptor.m_TimeMajor ? inputLayerShape[1] : inputLayerShape[0];
     unsigned int inputSize = inputLayerShape[2];
@@ -525,7 +531,7 @@
     //
     // Permute validate
     //
-    TensorInfo permuteOutInfo = TensorInfo(input);
+    TensorInfo permuteOutInfo = armnnUtils::Permuted(input, { 1U, 0U, 2U });
     arm_compute::TensorInfo aclPermuteOutInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permuteOutInfo);
     if (!descriptor.m_TimeMajor)
     {
@@ -590,7 +596,17 @@
 
     arm_compute::LSTMParams<arm_compute::ITensorInfo> lstm_params_info;
 
-    const TensorInfo& scratchBuffer = TensorInfo(cellStateIn.GetShape(), input.GetDataType());
+    unsigned int numUnits = cellStateIn.GetShape()[1];
+    unsigned int scratchBufferFactor = 4;
+
+    if (descriptor.m_CifgEnabled)
+    {
+        // scratchBuffer = { batchSize, numUnits * 3 } with CIFG
+       scratchBufferFactor = 3;
+    }
+
+    const TensorInfo& scratchBuffer = TensorInfo({ batchSize, numUnits * scratchBufferFactor }, input.GetDataType());
+
 
     lstm_params_info.set_cell_clip_params(descriptor.m_ClippingThresCell);
     lstm_params_info.set_projection_clip_params(descriptor.m_ClippingThresProj);
@@ -707,7 +723,7 @@
         // Set input of LSTM to be first input ITensor.
         // Set output of LSTM to be final output ITensor.
         // LSTM input/output cannot be > 2 dimensions so need to resize its TensorInfo.
-        if (maxTime == 1 && !descriptor.m_TimeMajor)
+        if (maxTime == 1 && descriptor.m_TimeMajor)
         {
             TensorShape inputShape = GetTensorShape(aclInputInfo.tensor_shape(), 1U);
             TensorShape outputShape = GetTensorShape(aclOutputInfo.tensor_shape(), 1U);
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 1f42397..a079bb7 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -2800,6 +2800,8 @@
                               UnidirectionalSequenceLstmLayerFloat32TimeMajorSingleBatchTest)
 ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32BatchMajorSingleBatch,
                               UnidirectionalSequenceLstmLayerFloat32BatchMajorSingleBatchTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32TimeMajorSingleTime,
+                              UnidirectionalSequenceLstmLayerFloat32TimeMajorSingleTimeTest)
 ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32,
                               UnidirectionalSequenceLstmLayerFloat32Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(UnidirectionalSequenceLstmLayerFloat32TimeMajor,
diff --git a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
index c7a4b76..4ca3e03 100644
--- a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
+++ b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
@@ -81,6 +81,7 @@
         outputShape = armnnUtils::Permuted(outputInfo.GetShape(), mappings);
         outputInfo.SetShape(outputShape);
     }
+    // As it is permuted to time major, maxTime is inputShape[0].
     unsigned int maxTime = inputShape[0];
     unsigned int batchSize = inputShape[1];
     unsigned int outputSize = outputShape[2];