Update Neon™ pooling kernel

- Reduce duplication and simplify overall structure.
- Improve multi-threaded performance by sharing more data
  in lower-level caches.

Partially Resolves: COMPMID-5054
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: I5f4dc50913401d5c1cbfc10b866fae9490cbc4d7
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7404
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Andrew Mundy
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
index 1905e1e..5ee0884 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,19 +45,6 @@
 namespace arm_conv {
 namespace pooling {
 
-namespace
-{
-  template <class Strategy>
-  bool is_supported(const PoolingArgs &args, const Nothing &)
-  {
-    return ((args.pool_type == Strategy::pooling_type()) &&
-            (args.pool_window.rows == Strategy::pool_rows()) &&
-            (args.pool_window.cols == Strategy::pool_cols()) &&
-            (args.pool_stride.rows == Strategy::stride_rows()) &&
-            (args.pool_stride.cols == Strategy::stride_cols()));
-  }
-}
-
 static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
   {
     PoolingMethod::DEPTHFIRST,
@@ -67,7 +54,8 @@
     },
     nullptr,
     [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
-      return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<float>>(args);
+      auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<float>(args.cpu_info);
+      return new PoolingDepthfirstGeneric<float, float, Nothing>(strat, args);
     },
   },
 #if defined(__aarch64__)
@@ -75,23 +63,27 @@
   {
     PoolingMethod::DEPTHFIRST,
     "sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst",
-    [] (const PoolingArgs &args, const Nothing &unused) -> bool {
-      return args.cpu_info->has_sve() && is_supported<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args, unused);
+    [] (const PoolingArgs &args, const Nothing &os) -> bool {
+      return args.cpu_info->has_sve() &&
+             is_supported<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
     },
     nullptr,
     [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
-      return new PoolingDepthfirst<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+      auto strat = new sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+      return new PoolingDepthfirst<float>(strat, args);
     },
   },
   {
     PoolingMethod::DEPTHFIRST,
     "sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst",
-    [] (const PoolingArgs &args, const Nothing &unused) -> bool {
-      return args.cpu_info->has_sve() && is_supported<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, unused);
+    [] (const PoolingArgs &args, const Nothing &os) -> bool {
+      return args.cpu_info->has_sve() &&
+             is_supported<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
     },
     nullptr,
     [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
-      return new PoolingDepthfirst<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+      auto strat = new sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+      return new PoolingDepthfirst<float>(strat, args);
     },
   },
   {
@@ -102,7 +94,8 @@
     },
     nullptr,
     [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
-      return new PoolingDepthfirstGeneric<sve_fp32_nhwc_avg_generic_depthfirst>(args);
+      auto strat = new sve_fp32_nhwc_avg_generic_depthfirst(args.cpu_info);
+      return new PoolingDepthfirstGeneric<float>(strat, args);
     },
   },
   {
@@ -113,7 +106,8 @@
     },
     nullptr,
     [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
-      return new PoolingDepthfirstGeneric<sve_fp32_nhwc_max_generic_depthfirst>(args);
+      auto strat = new sve_fp32_nhwc_max_generic_depthfirst(args.cpu_info);
+      return new PoolingDepthfirstGeneric<float>(strat, args);
     },
   },
 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
@@ -123,7 +117,8 @@
     is_supported<a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>,
     nullptr,
     [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
-      return new PoolingDepthfirst<a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+      auto strat = new a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+      return new PoolingDepthfirst<float>(strat, args);
     },
   },
   {
@@ -132,7 +127,8 @@
     is_supported<a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>,
     nullptr,
     [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
-      return new PoolingDepthfirst<a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+      auto strat = new a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+      return new PoolingDepthfirst<float>(strat, args);
     },
   },
   {
@@ -141,7 +137,8 @@
     [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
     nullptr,
     [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
-      return new PoolingDepthfirstGeneric<a64_fp32_nhwc_avg_generic_depthfirst>(args);
+      auto strat = new a64_fp32_nhwc_avg_generic_depthfirst(args.cpu_info);
+      return new PoolingDepthfirstGeneric<float>(strat, args);
     },
   },
   {
@@ -150,7 +147,8 @@
     [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
     nullptr,
     [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
-      return new PoolingDepthfirstGeneric<a64_fp32_nhwc_max_generic_depthfirst>(args);
+      auto strat = new a64_fp32_nhwc_max_generic_depthfirst(args.cpu_info);
+      return new PoolingDepthfirstGeneric<float>(strat, args);
     },
   },
 #endif  // defined(__aarch64__)