Integrate improved pooling layer on NEON

Resolves COMPMID-4035

Change-Id: I559f8c4208fba9193dfe5012f03ddaf26c746215
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4855
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index 4427bd4..185097d 100644
--- a/Android.bp
+++ b/Android.bp
@@ -326,6 +326,13 @@
         "src/core/NEON/kernels/NEWarpKernel.cpp",
         "src/core/NEON/kernels/NEWeightsReshapeKernel.cpp",
         "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp",
+        "src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp",
+        "src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp",
+        "src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp",
+        "src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp",
+        "src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp",
+        "src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp",
+        "src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp",
         "src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp",
         "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp",
         "src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp",
@@ -341,6 +348,7 @@
         "src/core/NEON/kernels/arm_gemm/quantized.cpp",
         "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
         "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
+        "src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.cpp",
         "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp",
         "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp",
         "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp",
@@ -723,6 +731,7 @@
         "src/runtime/NEON/functions/NEPermute.cpp",
         "src/runtime/NEON/functions/NEPhase.cpp",
         "src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp",
+        "src/runtime/NEON/functions/NEPoolingAssemblyDispatch.cpp",
         "src/runtime/NEON/functions/NEPoolingLayer.cpp",
         "src/runtime/NEON/functions/NEPriorBoxLayer.cpp",
         "src/runtime/NEON/functions/NEQLSTMLayer.cpp",
@@ -798,6 +807,42 @@
         },
         arm64: {
             srcs: [
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
+                "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp",