Update Neon™ depthwise kernel

- Reduce duplication and simplify overall structure.
- Improve multi-threaded performance by sharing more data
  in lower-level caches.

Partially Resolves: COMPMID-5054
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: Iac747f39b21c540122fa75218762631c4d787911
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7449
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Andrew Mundy
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
index 46a3118..4ff249a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,15 +25,14 @@
 #include "arm_gemm_local.hpp"
 
 #include "depthwise_implementation.hpp"
-#include "depthwise_depthfirst_quantized.hpp"
-#include "depthwise_depthfirst_generic_quantized.hpp"
-#include "depthwise_depthfirst_multiplier_quantized.hpp"
-#include "depthwise_depthfirst_generic_multiplier_quantized.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
 
 #include "depthwise_implementation_constraints.hpp"
 
 #if defined(__aarch64__)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 #include "kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
 #include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
 #include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
@@ -41,7 +40,7 @@
 #include "kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
 #include "kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
 #include "kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
-#endif  // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
 #include "kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
 #include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
 #include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
@@ -73,7 +72,7 @@
 
 static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depthwise_s8q_methods[] = {
 #if defined(__aarch64__)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
   {
     DepthwiseMethod::DEPTHFIRST,
     "sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
@@ -84,7 +83,8 @@
                              cpu_has_sve2),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstQuantized<sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+      auto strat = new sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
     },
   },
   {
@@ -96,7 +96,8 @@
                              cpu_has_sve2),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+      auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
     },
   },
   {
@@ -108,7 +109,8 @@
                              cpu_has_sve2),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+      auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
     },
   },
   {
@@ -120,7 +122,8 @@
                              cpu_has_sve2),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+      auto strat = new sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
     },
   },
   {
@@ -132,7 +135,8 @@
                              cpu_has_sve2),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+      auto strat = new sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
     },
   },
   {
@@ -140,10 +144,12 @@
     "sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
     constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
                              qp_has_no_left_shift,
+                             has_channel_multiplier,
                              cpu_has_sve2),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstWithMultiplierQuantized<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+      auto strat = new sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
     },
   },
   {
@@ -151,13 +157,15 @@
     "sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
     constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
                              qp_has_no_left_shift,
+                             has_channel_multiplier,
                              cpu_has_sve2),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstWithMultiplierQuantized<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+      auto strat = new sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
     },
   },
-#endif  // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
   {
     DepthwiseMethod::DEPTHFIRST,
     "a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
@@ -168,7 +176,8 @@
                              cpu_has_dot_product),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstQuantized<a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+      auto strat = new a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
     },
   },
   {
@@ -180,7 +189,8 @@
                              cpu_has_dot_product),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+      auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
     },
   },
   {
@@ -191,7 +201,8 @@
                              qp_has_no_left_shift),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+      auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
     },
   },
   {
@@ -202,7 +213,8 @@
                              qp_has_no_left_shift),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+      auto strat = new a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
     },
   },
   {
@@ -213,7 +225,8 @@
                              qp_has_no_left_shift),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+      auto strat = new a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
     },
   },
   {
@@ -222,7 +235,9 @@
     constraint<Requantize32>(has_no_channel_multiplier),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstGenericQuantized<a64_s8q_nhwc_generic_output9_mla_depthfirst, 3, 3>(args, qp);
+      auto kernel = new a64_s8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+      auto strat = new GenericDepthfirstStrategy<int8_t>(kernel, 3, 3, args);
+      return new DepthwiseDepthfirstGeneric<int8_t>(strat, args, qp);
     },
   },
   {
@@ -230,10 +245,12 @@
     "a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
     constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
                              qp_has_no_left_shift,
+                             has_channel_multiplier,
                              cpu_has_dot_product),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstWithMultiplierQuantized<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+      auto strat = new a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
     },
   },
   {
@@ -241,19 +258,23 @@
     "a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
     constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
                              qp_has_no_left_shift,
+                             has_channel_multiplier,
                              cpu_has_dot_product),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstWithMultiplierQuantized<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+      auto strat = new a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
     },
   },
   {
     DepthwiseMethod::DEPTHFIRST,
     "a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
-    nullptr,
+    constraint<Requantize32>(has_channel_multiplier),
     nullptr,
     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
-      return new DepthwiseDepthfirstGenericWithMultiplierQuantized<a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args, qp);
+      auto kern = new a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+      auto strat = new GenericDepthfirstMultiplierStrategy<int8_t>(kern, args);
+      return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, true>(strat, args, qp);
     },
   },
 #endif  // defined(__aarch64__)