COMPMID-3576: Nightly failure: NEON/PoolingLayer/Float/FP16/MaxUnpooling S10

Extend NEPoolingLayer max pooling to extract indices for FP16

Signed-off-by: Sheri Zhang <sheri.zhang@arm.com>
Change-Id: I5a7c754be353e4c2c5d0ab3794e9427408d0c4fa
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3580
Reviewed-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
index 9c9b194..f3ea049 100644
--- a/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
@@ -55,7 +55,10 @@
      * @note Output shape must be equal to the shape of the original input to pool.
      *
      * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] indices   The indices of the maximal values. Data type supported: U32.
+     * @param[in]  indices   Tensor containing the offset to store the input elements in the output tensor.
+     *                       @ref NEPoolingLayerKernel with indices should precede this function in order to
+     *                       properly reconstruct the output tensor.
+     *                       The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index 15d63c7..2be2508 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -91,12 +91,6 @@
      * @param[in] window_input Input region on which to execute the kernel.
      * @param[in] window       Output region on which to execute the kernel.
      */
-    void pooling2_f32_nchw_maxpool_indices(const Window &window_input, const Window &window);
-    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
     void pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window);
     /** Function to perform MxN pooling for 32-bit floating point values.
      *
@@ -138,6 +132,19 @@
      * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
      */
     void pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 2x2 pooling and compute the pooling indices for FP32/FP16. The indices can be used for max unpool.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <typename T>
+    void pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window);
+    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    void pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window);
     /** Function to perform 3x3 pooling.
      *
      * @param[in] window_input    Input region on which to execute the kernel.
diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
index 468bc70..f13b4bd 100644
--- a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
@@ -45,7 +45,7 @@
     NEMaxUnpoolingLayer();
     /** Set the input and output tensors.
      *
-     * @note F16 is supported for pool sizes 2 and 3 only
+     * @note Only supported pool size 2
      *
      * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[out]     output    Destination tensor. Data types supported: Same as @p input.
@@ -55,7 +55,7 @@
     void configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration of @ref NEMaxUnpoolingLayer
      *
-     * @note F16 is supported for pool sizes 2 and 3 only
+     * @note Only supported pool size 2
      *
      * @param[in] input     Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] indices   The indices of the maximal values. Data type supported: U32.