COMPMID-2754: Add support for QASYMM8_SIGNED in NE kernels/functions.

Kernels/Functions extended support:
- NEBatchToSpaceLayerKernel/NEBatchToSpaceLayer
- NEChannelShuffleLayerKernel/NEChannelShuffleLayer
- NECol2ImKernel/NECol2Im
- NEConvertFullyConnectedWeightsKernel/NEConvertFullyConnectedWeights
- NECopyKernel/NECopy
- NEConvolutionLayerReshapeWeights
- NEDepthToSpaceLayerKernel/NEDepthToSpaceLayer
- NEFlattenLayerKernel/NEFlattenLayer
- NEFillBorderKernel
- NEFullyConnectedLayerReshapeWeights
- NEGatherKernel/NEGather
- NEGEMMInterleave4x4Kernel
- NEGEMMTranspose1xWKernel
- NEIm2ColKernel/NEIm2Col
- NEMemsetKernel
- NEPadLayerKernel/NEPadLayer
- NEPermuteKernel/NEPermute
- NEReverseKernel/NEReverse
- NEReorgLayerKernel/NEReorgLayer
- NEReshapeLayerKernel/NEReshapeLayer
- NESplit
- NESlice
- NEStridedSliceKernel/NEStridedSlice
- NESpaceToBatchLayerKernel/NESpaceToBatchLayer
- NESpaceToDepthLayerKernel/NESpaceToDepthLayerKernel
- NEStackLayerKernel/NEStackLayer
- NETileKernel/NETile
- NETransposeKernel/NETranspose
- NEWidthConcatenateLayerKernel/NEHeightConcatenateLayer
- NEHeightConcatenateLayerKernel/NEHeightConcatenateLayer
- NEDepthConcatenateLayerKernel/NEDepthConcatenateLayer
- NEBathConcatenateLayerKernel/NEBatchConcatenateLayer

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: Ia070332ad4c4dbced2541dc46f7f2f3a86833b65
Reviewed-on: https://review.mlplatform.org/c/2442
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
index cfa98fb..3e250f5 100644
--- a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
@@ -66,6 +66,16 @@
         },
         input, output);
     }
+    else if(dt == DataType::QASYMM8_SIGNED && input_qinfo != output_qinfo)
+    {
+        execute_window_loop(window, [&](const Coordinates &)
+        {
+            const auto in_ptr  = reinterpret_cast<const int8_t *>(input_ptr + input.offset());
+            const auto out_ptr = reinterpret_cast<int8_t *>(output_ptr + output.offset());
+            vst1q_s8(out_ptr, vquantize_signed(vdequantize(vld1q_s8(in_ptr), input_qinfo), output_qinfo));
+        },
+        input, output);
+    }
     else
     {
         execute_window_loop(window, [&](const Coordinates &)
@@ -102,10 +112,7 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
-                                                         DataType::U16, DataType::S16,
-                                                         DataType::U32, DataType::S32,
-                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
@@ -138,6 +145,7 @@
         case DataType::S8:
         case DataType::U8:
         case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
             _func = &batch_concat<uint8_t>;
             break;
         case DataType::S16: