Integrate SME2 kernels

* Add SME/SME2 detection.
* Integrate SME2 implementation for:
  - Normal convolution
  - Winograd
  - Depthwise convolution
  - Pooling

Resolves: COMPMID-5700
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I2f1ca1d05f8cfeee9309ed1c0a36096a4a6aad5c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8692
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index 246cd59..e79d9b2 100644
--- a/Android.bp
+++ b/Android.bp
@@ -326,10 +326,12 @@
         "src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp",
         "src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp",
         "src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
+        "src/core/NEON/kernels/arm_gemm/interleave_indirect-sve.cpp",
         "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
         "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp",
         "src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp",
         "src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
+        "src/core/NEON/kernels/arm_gemm/misc-sve.cpp",
         "src/core/NEON/kernels/arm_gemm/misc.cpp",
         "src/core/NEON/kernels/arm_gemm/quantized.cpp",
         "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
@@ -1004,6 +1006,37 @@
                 "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
                 "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
                 "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_2rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_2rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_2rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp",
                 "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
                 "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
                 "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
@@ -1064,6 +1097,24 @@
                 "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp",
                 "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
                 "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp",
                 "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
                 "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
                 "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
@@ -1155,6 +1206,26 @@
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp",
@@ -1206,8 +1277,10 @@
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp",
                 "src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp",
                 "src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp32_6x6.cpp",
+                "src/core/NEON/kernels/convolution/winograd/input_transforms/sme_fp32_mla_6x6.cpp",
                 "src/core/NEON/kernels/convolution/winograd/input_transforms/sve_fp32_6x6.cpp",
                 "src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp",
+                "src/core/NEON/kernels/convolution/winograd/output_transforms/sme_fp32_mopa_4x4_3x3.cpp",
                 "src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp",
                 
             ],