IVGCVSW-7964 Fix UnidirectionalSequenceLstm

 * Fix incorrect batch size and time size
 * Fix incorrect time major when max time = 1
 * Fix incorrect permutation
 * Fix incorrect scratch buffer
 * Unit tests

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: I510fae55528be412a58d020e82bd283852e7800b
diff --git a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp
index 8a1747e..984a5dc 100644
--- a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp
+++ b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp
@@ -500,6 +500,12 @@
     TensorShape inputLayerShape = input.GetShape();
     TensorShape outputLayerShape = output.GetShape();
 
+    if (inputLayerShape.GetNumDimensions() != 3)
+    {
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
+                                   "Unidirectional Sequence LSTM layer validate status failed.");
+    }
+
     unsigned int maxTime = descriptor.m_TimeMajor ? inputLayerShape[0] : inputLayerShape[1];
     unsigned int batchSize = descriptor.m_TimeMajor ? inputLayerShape[1] : inputLayerShape[0];
     unsigned int inputSize = inputLayerShape[2];
@@ -525,7 +531,7 @@
     //
     // Permute validate
     //
-    TensorInfo permuteOutInfo = TensorInfo(input);
+    TensorInfo permuteOutInfo = armnnUtils::Permuted(input, { 1U, 0U, 2U });
     arm_compute::TensorInfo aclPermuteOutInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permuteOutInfo);
     if (!descriptor.m_TimeMajor)
     {
@@ -590,7 +596,17 @@
 
     arm_compute::LSTMParams<arm_compute::ITensorInfo> lstm_params_info;
 
-    const TensorInfo& scratchBuffer = TensorInfo(cellStateIn.GetShape(), input.GetDataType());
+    unsigned int numUnits = cellStateIn.GetShape()[1];
+    unsigned int scratchBufferFactor = 4;
+
+    if (descriptor.m_CifgEnabled)
+    {
+        // scratchBuffer = { batchSize, numUnits * 3 } with CIFG
+       scratchBufferFactor = 3;
+    }
+
+    const TensorInfo& scratchBuffer = TensorInfo({ batchSize, numUnits * scratchBufferFactor }, input.GetDataType());
+
 
     lstm_params_info.set_cell_clip_params(descriptor.m_ClippingThresCell);
     lstm_params_info.set_projection_clip_params(descriptor.m_ClippingThresProj);
@@ -707,7 +723,7 @@
         // Set input of LSTM to be first input ITensor.
         // Set output of LSTM to be final output ITensor.
         // LSTM input/output cannot be > 2 dimensions so need to resize its TensorInfo.
-        if (maxTime == 1 && !descriptor.m_TimeMajor)
+        if (maxTime == 1 && descriptor.m_TimeMajor)
         {
             TensorShape inputShape = GetTensorShape(aclInputInfo.tensor_shape(), 1U);
             TensorShape outputShape = GetTensorShape(aclOutputInfo.tensor_shape(), 1U);