COMPMID-403: Add 7x7 NEON Pooling support.

Change-Id: I2f1e808884f215b9cf79e1f2015ef901e66b3e5f
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78146
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index fb71261..201c5b5 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -681,6 +681,15 @@
  * @return The result of the 8bit power.
  */
 qint8x8_t vqpowq_qs8(qint8x8_t a, qint8x16_t b, int fixed_point_position);
+
+/** Compute the lane-by-lane maximum between the elements of two float vectors with 4x2 elements
+ *
+ * @param[in] a Float input vector
+ * @param[in] b Float input vector
+ *
+ * @return The lane-by-lane maximum as a float32x4x2_t vector
+ */
+float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b);
 }
 #include "arm_compute/core/NEON/NEFixedPoint.inl"
 #endif /* __ARM_COMPUTE_NEFIXEDPOINT_H__ */
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index 6db344d..b57fd3e 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -1015,4 +1015,16 @@
 {
     return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
 }
+
+inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
+{
+    // The 4x2 vector is handled as two independent 4-lane halves:
+    // vmaxq_f32 takes the lane-by-lane maximum of each pair of halves.
+    float32x4x2_t max_pair;
+
+    max_pair.val[0] = vmaxq_f32(a.val[0], b.val[0]);
+    max_pair.val[1] = vmaxq_f32(a.val[1], b.val[1]);
+
+    return max_pair;
+}
 }
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index 62a0878..bf06fdd 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -87,6 +87,13 @@
      */
     template <PoolingType pooling_type>
     void pooling3_q8(const Window &window_input, const Window &window);
+    /** Function to perform 7x7 pooling.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling7_f32(const Window &window_input, const Window &window);
     /** Common signature for all the specialised Pooling functions
      *
      * @param[in] window_input Input region on which to execute the kernel.