COMPMID-2601 [CL] add mixed precision support to PoolingLayer

* PoolingLayerInfo is updated with a new flag.
* CL Kernel is updated to use FP32 accumulation.
* CL pooling layer testscases are added for mixed precision.
* Reference pooling layer is updated to use FP32 accumulation.

Change-Id: I4ab2167cc7f86c86293cf50a0ca5119c04dc9c7e
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1973
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: VidhyaSudhan Loganathan <vidhyasudhan.loganathan@arm.com>
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 7f60638..9641089 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1195,39 +1195,44 @@
 public:
     /** Default Constructor */
     PoolingLayerInfo()
-        : _pool_type(PoolingType::MAX), _pool_size(Size2D()), _pad_stride_info(PadStrideInfo()), _exclude_padding(false), _is_global_pooling(false)
+        : _pool_type(PoolingType::MAX), _pool_size(Size2D()), _pad_stride_info(PadStrideInfo()), _exclude_padding(false), _is_global_pooling(false), _fp_mixed_precision(false)
     {
     }
     /** Default Constructor
      *
-     * @param[in] pool_type       Pooling type @ref PoolingType.
-     * @param[in] pool_size       Pooling size, in elements, across  x and y.
-     * @param[in] pad_stride_info (Optional) Padding and stride information @ref PadStrideInfo
-     * @param[in] exclude_padding (Optional) Strategy when accounting padding in calculations.
-     *                             True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
-     *                             Defaults to false;
+     * @param[in] pool_type          Pooling type @ref PoolingType.
+     * @param[in] pool_size          Pooling size, in elements, across  x and y.
+     * @param[in] pad_stride_info    (Optional) Padding and stride information @ref PadStrideInfo
+     * @param[in] exclude_padding    (Optional) Strategy when accounting padding in calculations.
+     *                               True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
+     *                               Defaults to false;
+     * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
      */
     explicit PoolingLayerInfo(PoolingType   pool_type,
                               unsigned int  pool_size,
-                              PadStrideInfo pad_stride_info = PadStrideInfo(),
-                              bool          exclude_padding = false)
-        : _pool_type(pool_type), _pool_size(Size2D(pool_size, pool_size)), _pad_stride_info(pad_stride_info), _exclude_padding(exclude_padding), _is_global_pooling(false)
+                              PadStrideInfo pad_stride_info    = PadStrideInfo(),
+                              bool          exclude_padding    = false,
+                              bool          fp_mixed_precision = false)
+        : _pool_type(pool_type), _pool_size(Size2D(pool_size, pool_size)), _pad_stride_info(pad_stride_info), _exclude_padding(exclude_padding), _is_global_pooling(false),
+          _fp_mixed_precision(fp_mixed_precision)
     {
     }
     /** Default Constructor
      *
-     * @param[in] pool_type       Pooling type @ref PoolingType.
-     * @param[in] pool_size       Pooling size, in elements, across  x and y.
-     * @param[in] pad_stride_info (Optional) Padding and stride information @ref PadStrideInfo
-     * @param[in] exclude_padding (Optional) Strategy when accounting padding in calculations.
-     *                             True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
-     *                             Defaults to false;
+     * @param[in] pool_type          Pooling type @ref PoolingType.
+     * @param[in] pool_size          Pooling size, in elements, across  x and y.
+     * @param[in] pad_stride_info    (Optional) Padding and stride information @ref PadStrideInfo
+     * @param[in] exclude_padding    (Optional) Strategy when accounting padding in calculations.
+     *                               True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
+     *                               Defaults to false;
+     * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
      */
     explicit PoolingLayerInfo(PoolingType   pool_type,
                               Size2D        pool_size,
-                              PadStrideInfo pad_stride_info = PadStrideInfo(),
-                              bool          exclude_padding = false)
-        : _pool_type(pool_type), _pool_size(pool_size), _pad_stride_info(pad_stride_info), _exclude_padding(exclude_padding), _is_global_pooling(false)
+                              PadStrideInfo pad_stride_info    = PadStrideInfo(),
+                              bool          exclude_padding    = false,
+                              bool          fp_mixed_precision = false)
+        : _pool_type(pool_type), _pool_size(pool_size), _pad_stride_info(pad_stride_info), _exclude_padding(exclude_padding), _is_global_pooling(false), _fp_mixed_precision(fp_mixed_precision)
     {
     }
     /** Default Constructor
@@ -1237,7 +1242,7 @@
      * @param[in] pool_type Pooling type @ref PoolingType.
      */
     explicit PoolingLayerInfo(PoolingType pool_type)
-        : _pool_type(pool_type), _pool_size(Size2D()), _pad_stride_info(PadStrideInfo(1, 1, 0, 0)), _exclude_padding(false), _is_global_pooling(true)
+        : _pool_type(pool_type), _pool_size(Size2D()), _pad_stride_info(PadStrideInfo(1, 1, 0, 0)), _exclude_padding(false), _is_global_pooling(true), _fp_mixed_precision(false)
     {
     }
     /** Get the pooling type */
@@ -1260,6 +1265,11 @@
     {
         return _exclude_padding;
     }
+    /** Check if a wider accumulator should be used. */
+    bool fp_mixed_precision() const
+    {
+        return _fp_mixed_precision;
+    }
     /** Check if is global pooling */
     bool is_global_pooling() const
     {
@@ -1272,6 +1282,7 @@
     PadStrideInfo _pad_stride_info;
     bool          _exclude_padding;
     bool          _is_global_pooling;
+    bool          _fp_mixed_precision;
 };
 
 /** ROI Pooling Layer Information class */
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 6b2da0b..c8b5e07 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -38,7 +38,7 @@
 #define DIV_OP(x, y) (x * (1.f / y))
 #define SQRT_OP(x) sqrt((x))
 
-#define DIV_OP_NHWC(x, y) (x * (VEC_DATA_TYPE(float, 8))(1.f / y))
+#define DIV_OP_NHWC(x, y) (x * (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(1.f / y))
 
 #if STRIDE_X == 1
 #define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
@@ -48,121 +48,129 @@
 #define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
 #endif /* STRIDE_X == 3 */
 
-#define POOLING3x3_STRIDE1(res, input, output)                                                                                               \
+#if defined(FP_MIXED_PRECISION)
+#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
+    CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
+#else /* defined(FP_MIXED_PRECISION) */
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
+#endif /* defined(FP_MIXED_PRECISION) */
+
+#define POOLING3x3_STRIDE1(res, input, output)                                                                                                       \
+    ({                                                                                                                                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                                   \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
+        data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4);                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                                   \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
+        data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4);                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                                   \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
+        data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4);                               \
+        data00 = POW2_OP(data00, 4);                                                                                                                 \
+        data01 = POW2_OP(data01, 2);                                                                                                                 \
+        data10 = POW2_OP(data10, 4);                                                                                                                 \
+        data11 = POW2_OP(data11, 2);                                                                                                                 \
+        data20 = POW2_OP(data20, 4);                                                                                                                 \
+        data21 = POW2_OP(data21, 2);                                                                                                                 \
+        \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01);                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01);                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01);                                                              \
+        \
+        values00 = POOL_OP(values00, values10);                                                                                                      \
+        values01 = POOL_OP(values01, values11);                                                                                                      \
+        values00 = POOL_OP(values00, values20);                                                                                                      \
+        values01 = POOL_OP(values01, values21);                                                                                                      \
+        \
+        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
+        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03));                                                           \
+    })
+
+#define POOLING3x3_STRIDE2(res, input, output)                                                                                                       \
+    ({                                                                                                                                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        data00               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                     \
+        ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8));                                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        data10               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                     \
+        ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8));                                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        data20               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                     \
+        ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8));                                       \
+        data00               = POW2_OP(data00, 8);                                                                                                   \
+        data01               = POW2_OP(data01, 1);                                                                                                   \
+        data10               = POW2_OP(data10, 8);                                                                                                   \
+        data11               = POW2_OP(data11, 1);                                                                                                   \
+        data20               = POW2_OP(data20, 8);                                                                                                   \
+        data21               = POW2_OP(data21, 1);                                                                                                   \
+        \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01);                                                                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11);                                                                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21);                                                                           \
+        \
+        values00 = POOL_OP(values00, values10);                                                                                                      \
+        values01 = POOL_OP(values01, values11);                                                                                                      \
+        values00 = POOL_OP(values00, values20);                                                                                                      \
+        values01 = POOL_OP(values01, values21);                                                                                                      \
+        \
+        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
+        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03));                                                           \
+    })
+
+#define POOLING3x3_STRIDE3(res, input, output)                                                                                               \
     ({                                                                                                                                       \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                          \
-        data00 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                                                          \
-        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                                                          \
-        data01 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4);                                                      \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                          \
-        data10 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                                                          \
-        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                                                          \
-        data11 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4);                                                      \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                          \
-        data20 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                                                          \
-        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                                                          \
-        data21 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4);                                                      \
-        data00 = POW2_OP(data00, 4);                                                                                                         \
-        data01 = POW2_OP(data01, 2);                                                                                                         \
-        data10 = POW2_OP(data10, 4);                                                                                                         \
-        data11 = POW2_OP(data11, 2);                                                                                                         \
-        data20 = POW2_OP(data20, 4);                                                                                                         \
-        data21 = POW2_OP(data21, 2);                                                                                                         \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
+        data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
+        data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8);                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
+        data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
+        data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8);                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
+        data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
+        data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8);                       \
+        data00 = POW2_OP(data00, 8);                                                                                                         \
+        data01 = POW2_OP(data01, 4);                                                                                                         \
+        data10 = POW2_OP(data10, 8);                                                                                                         \
+        data11 = POW2_OP(data11, 4);                                                                                                         \
+        data20 = POW2_OP(data20, 8);                                                                                                         \
+        data21 = POW2_OP(data21, 4);                                                                                                         \
         \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
-        values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01212323);                                                                          \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                          \
-        values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01);                                                          \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
-        values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01212323);                                                                          \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                          \
-        values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01);                                                          \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
-        values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01212323);                                                                          \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                          \
-        values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01);                                                          \
+        data00 = POOL_OP(data00, data10);                                                                                                    \
+        data01 = POOL_OP(data01, data11);                                                                                                    \
+        data00 = POOL_OP(data00, data20);                                                                                                    \
+        data01 = POOL_OP(data01, data21);                                                                                                    \
         \
-        values00 = POOL_OP(values00, values10);                                                                                              \
-        values01 = POOL_OP(values01, values11);                                                                                              \
-        values00 = POOL_OP(values00, values20);                                                                                              \
-        values01 = POOL_OP(values01, values21);                                                                                              \
-        \
-        res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \
-        res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03));                                                       \
+        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \
+        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03));                                                       \
     })
 
-#define POOLING3x3_STRIDE2(res, input, output)                                                                                               \
-    ({                                                                                                                                       \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
-        data00           = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                                                \
-        DATA_TYPE data01 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8);                                                    \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
-        data10           = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                                                \
-        DATA_TYPE data11 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8);                                                    \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
-        data20           = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                                                \
-        DATA_TYPE data21 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8);                                                    \
-        data00           = POW2_OP(data00, 8);                                                                                               \
-        data01           = POW2_OP(data01, 1);                                                                                               \
-        data10           = POW2_OP(data10, 8);                                                                                               \
-        data11           = POW2_OP(data11, 1);                                                                                               \
-        data20           = POW2_OP(data20, 8);                                                                                               \
-        data21           = POW2_OP(data21, 1);                                                                                               \
-        \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
-        values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01223445);                                                                          \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                          \
-        values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s667, data01);                                                                       \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
-        values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01223445);                                                                          \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                          \
-        values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data10.s667, data11);                                                                       \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
-        values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01223445);                                                                          \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                          \
-        values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data20.s667, data21);                                                                       \
-        \
-        values00 = POOL_OP(values00, values10);                                                                                              \
-        values01 = POOL_OP(values01, values11);                                                                                              \
-        values00 = POOL_OP(values00, values20);                                                                                              \
-        values01 = POOL_OP(values01, values21);                                                                                              \
-        \
-        res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \
-        res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03));                                                       \
-    })
-
-#define POOLING3x3_STRIDE3(res, input, output)                                                                                       \
-    ({                                                                                                                               \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                  \
-        data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                                                  \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                  \
-        data01 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8);                                              \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                  \
-        data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                                                  \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                  \
-        data11 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8);                                              \
-        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                  \
-        data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                                                  \
-        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                  \
-        data21 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8);                                              \
-        data00 = POW2_OP(data00, 8);                                                                                                 \
-        data01 = POW2_OP(data01, 4);                                                                                                 \
-        data10 = POW2_OP(data10, 8);                                                                                                 \
-        data11 = POW2_OP(data11, 4);                                                                                                 \
-        data20 = POW2_OP(data20, 8);                                                                                                 \
-        data21 = POW2_OP(data21, 4);                                                                                                 \
-        \
-        data00 = POOL_OP(data00, data10);                                                                                            \
-        data01 = POOL_OP(data01, data11);                                                                                            \
-        data00 = POOL_OP(data00, data20);                                                                                            \
-        data01 = POOL_OP(data01, data21);                                                                                            \
-        \
-        res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s147, data01.s2)); \
-        res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s25, data01.s03));                                                   \
-    })
-
-DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
-                              const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
     int       start_x = get_global_id(0) * stride_x - pad_x;
     int       start_y = get_global_id(1) * stride_y - pad_y;
@@ -210,10 +218,10 @@
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
     // Load data
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    data0 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
 
 #if defined(POOL_L2)
     // Raise to power of 2 for L2 Pooling
@@ -222,8 +230,8 @@
 #endif /* defined(POOL_L2) */
 
     // Perform calculations
-    data0         = POOL_OP(data0, data1);
-    DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
+    data0             = POOL_OP(data0, data1);
+    ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
 
 #if defined(POOL_AVG) || defined(POOL_L2)
     // Divide by pool region in case of average or l2 pooling
@@ -236,7 +244,7 @@
 #endif /* defined(POOL_L2) */
 
     // Store result
-    *(__global DATA_TYPE *)output.ptr = res;
+    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
 }
 
 /** Performs a pooling function of pool size equal to 3
@@ -274,12 +282,12 @@
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
     // Load data
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    data0 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    data1 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
 
 #if defined(POOL_L2)
     // Raise to power of 2 for L2 Pooling
@@ -289,9 +297,9 @@
 #endif /* defined(POOL_L2) */
 
     // Perform calculations
-    data0         = POOL_OP(data0, data1);
-    data0         = POOL_OP(data0, data2);
-    DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
+    data0             = POOL_OP(data0, data1);
+    data0             = POOL_OP(data0, data2);
+    ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
 
 #if defined(POOL_AVG) || defined(POOL_L2)
     // Divide by pool region in case of average pooling
@@ -304,7 +312,7 @@
 #endif /* defined(POOL_L2) */
 
     // Store result
-    *(__global DATA_TYPE *)output.ptr = res;
+    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
 }
 
 #if defined(POOLING3x3)
@@ -312,7 +320,7 @@
 #define CONVERT_OP(data_type) convert_##data_type##4
 #define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
 
-VEC_DATA_TYPE(DATA_TYPE, 4)
+VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
 calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
                      const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
@@ -324,7 +332,7 @@
     start_x = max((int4)0, start_x);
     start_y = max(0, start_y);
 #endif /* defined(EXCLUDE_PADDING) */
-    return (VEC_DATA_TYPE(DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
+    return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
 }
 
 /** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
@@ -361,7 +369,7 @@
     Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    VEC_DATA_TYPE(DATA_TYPE, 4)
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
     res;
 
     // Perform pooling 3x3 for 4 output elements
@@ -377,7 +385,7 @@
     res = SQRT_OP(res);
 #endif /* defined(POOL_L2) */
 
-    vstore4(res, 0, (__global DATA_TYPE *)output.ptr);
+    vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr);
 }
 #endif // defined(POOLING3x3)
 
@@ -431,9 +439,9 @@
     Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    vdata           = INITIAL_VALUE;
-    DATA_TYPE sdata = INITIAL_VALUE;
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+    vdata               = INITIAL_VALUE;
+    ACC_DATA_TYPE sdata = INITIAL_VALUE;
 
     // Load data
     for(int y = 0; y < POOL_SIZE_Y; y++)
@@ -441,8 +449,8 @@
         int x = 0;
         for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
         {
-            VEC_DATA_TYPE(DATA_TYPE, 8)
-            data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+            VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+            data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
 #if defined(POOL_L2)
             // Raise to power of 2 for L2 Pooling
             data0 *= data0;
@@ -453,7 +461,7 @@
         // Leftover
         for(; x < (int)POOL_SIZE_X; ++x)
         {
-            DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+            ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)));
 #if defined(POOL_L2)
             // Raise to power of 2 for L2 Pooling
             data0 *= data0;
@@ -463,12 +471,12 @@
     }
 
     // Reduce result
-    VEC_DATA_TYPE(DATA_TYPE, 4)
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
     reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    reduce2       = POOL_OP(reduce4.s01, reduce4.s23);
-    DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
-    res           = POOL_OP(res, sdata);
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    reduce2           = POOL_OP(reduce4.s01, reduce4.s23);
+    ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+    res               = POOL_OP(res, sdata);
 
 #if defined(POOL_AVG) || defined(POOL_L2)
     // Divide by pool region in case of average pooling
@@ -481,12 +489,12 @@
 #endif /* defined(POOL_L2) */
 
     // Store result
-    *(__global DATA_TYPE *)output.ptr = res;
+    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
 }
 #endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
 
-float calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
-                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+ACC_DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
+                                       const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
     int start_x = get_global_id(1) * stride_x - pad_x;
 #if defined(DST_DEPTH)
@@ -553,7 +561,7 @@
     Tensor3D  output     = CONVERT_TO_TENSOR3D_STRUCT(output);
 #endif /* defined(DST_DEPTH) */
 
-    VEC_DATA_TYPE(float, 8)
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
     vdata = INITIAL_VALUE;
 
     const int idx_width = get_global_id(1) * STRIDE_X;
@@ -572,18 +580,18 @@
             x1     = select(x1, PAD_X - idx_width - 1, y != y1);
 
 #if defined(DST_DEPTH)
-            VEC_DATA_TYPE(DATA_TYPE, 8)
-            data0 = vload8(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
+            VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+            data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
 #else  /* defined(DST_DEPTH) */
-            VEC_DATA_TYPE(DATA_TYPE, 8)
-            data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+            VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+            data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
 #endif /* defined(DST_DEPTH) */
 
 #if defined(POOL_L2)
             // Raise to power of 2 for L2 Pooling
             data0 *= data0;
 #endif /* defined(POOL_L2) */
-            vdata = POOL_OP(vdata, CONVERT(data0, float8));
+            vdata = POOL_OP(vdata, CONVERT(data0, VEC_DATA_TYPE(ACC_DATA_TYPE, 8)));
         }
     }
 
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 8eaf5bf..8e69157 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -236,6 +236,12 @@
 
     build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
 
+    const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision();
+    const auto use_wider_accumulator  = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
+    const auto acc_data_type          = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type);
+    build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
+    build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
+
     // Create kernel
     switch(data_layout)
     {
diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp
index 7d79f3f..ff7c24f 100644
--- a/tests/validation/CL/PoolingLayer.cpp
+++ b/tests/validation/CL/PoolingLayer.cpp
@@ -76,6 +76,8 @@
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);  /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric type */
 const auto                           pool_data_layout_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC });
 
+const auto pool_fp_mixed_precision_dataset = framework::dataset::make("FpMixedPrecision", { true, false });
+
 } // namespace
 
 TEST_SUITE(CL)
@@ -125,6 +127,9 @@
 template <typename T>
 using CLSpecialPoolingLayerFixture = SpecialPoolingLayerValidationFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
 
+template <typename T>
+using CLMixedPrecesionPoolingLayerFixture = PoolingLayerValidationMixedPrecisionFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSpecial, CLSpecialPoolingLayerFixture<float>, framework::DatasetMode::ALL, datasets::PoolingLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
@@ -151,16 +156,18 @@
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPSmall,
-                                                                                                                 framework::dataset::make("DataType", DataType::F16))),
-                                                                                                         pool_data_layout_dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMixedPrecesionPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPSmall,
+                                                                                                                       framework::dataset::make("DataType", DataType::F16))),
+                                                                                                                       pool_data_layout_dataset),
+                                                                                                                       pool_fp_mixed_precision_dataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
-                                                                                                               framework::dataset::make("DataType", DataType::F16))),
-                                                                                                       pool_data_layout_dataset))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLMixedPrecesionPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                                     framework::dataset::make("DataType", DataType::F16))),
+                                                                                                                     pool_data_layout_dataset),
+                                                                                                                     pool_fp_mixed_precision_dataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/fixtures/PoolingLayerFixture.h b/tests/validation/fixtures/PoolingLayerFixture.h
index 1813ef4..cdc2cae 100644
--- a/tests/validation/fixtures/PoolingLayerFixture.h
+++ b/tests/validation/fixtures/PoolingLayerFixture.h
@@ -141,6 +141,18 @@
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class PoolingLayerValidationMixedPrecisionFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, DataLayout data_layout, bool fp_mixed_precision = false)
+    {
+        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, pad_stride_info, exclude_padding, fp_mixed_precision),
+                                                                                               data_type, data_layout);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
 class PoolingLayerValidationQuantizedFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index 34b19ff..010412c 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -37,8 +37,8 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-template <typename T>
-SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
+template <typename T, typename ACC_T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
 {
     ARM_COMPUTE_UNUSED(output_qinfo); // requantization occurs in pooling_layer<uint8_t>
     ARM_COMPUTE_ERROR_ON(info.is_global_pooling() && (src.shape().x() != src.shape().y()));
@@ -79,12 +79,12 @@
                     wstart     = std::max(wstart, 0);
                     hstart     = std::max(hstart, 0);
 
-                    T max_val = std::numeric_limits<T>::lowest();
+                    auto max_val = std::numeric_limits<ACC_T>::lowest();
                     for(int y = hstart; y < hend; ++y)
                     {
                         for(int x = wstart; x < wend; ++x)
                         {
-                            const T val = src[r * h_src * w_src + y * w_src + x];
+                            const auto val = static_cast<ACC_T>(src[r * h_src * w_src + y * w_src + x]);
                             if(val > max_val)
                             {
                                 max_val = val;
@@ -92,7 +92,7 @@
                         }
                     }
 
-                    dst[r * h_dst * w_dst + h * w_dst + w] = max_val;
+                    dst[r * h_dst * w_dst + h * w_dst + w] = static_cast<T>(max_val);
                 }
             }
         }
@@ -105,16 +105,16 @@
             {
                 for(int w = 0; w < w_dst; ++w)
                 {
-                    T   avg_val(0);
-                    int wstart = w * pool_stride_x - pad_left;
-                    int hstart = h * pool_stride_y - pad_top;
-                    int wend   = std::min(wstart + pool_size_x, w_src + pad_right);
-                    int hend   = std::min(hstart + pool_size_y, h_src + pad_bottom);
-                    int pool   = (hend - hstart) * (wend - wstart);
-                    wstart     = std::max(wstart, 0);
-                    hstart     = std::max(hstart, 0);
-                    wend       = std::min(wend, w_src);
-                    hend       = std::min(hend, h_src);
+                    ACC_T avg_val(0);
+                    int   wstart = w * pool_stride_x - pad_left;
+                    int   hstart = h * pool_stride_y - pad_top;
+                    int   wend   = std::min(wstart + pool_size_x, w_src + pad_right);
+                    int   hend   = std::min(hstart + pool_size_y, h_src + pad_bottom);
+                    int   pool   = (hend - hstart) * (wend - wstart);
+                    wstart       = std::max(wstart, 0);
+                    hstart       = std::max(hstart, 0);
+                    wend         = std::min(wend, w_src);
+                    hend         = std::min(hend, h_src);
                     // Exclude padding pixels from the average
                     if(exclude_padding)
                     {
@@ -127,7 +127,7 @@
                         {
                             for(int x = wstart; x < wend; ++x)
                             {
-                                avg_val += src[r * h_src * w_src + y * w_src + x];
+                                avg_val += static_cast<ACC_T>(src[r * h_src * w_src + y * w_src + x]);
                             }
                         }
                         dst[r * h_dst * w_dst + h * w_dst + w] = avg_val / pool;
@@ -138,11 +138,11 @@
                         {
                             for(int x = wstart; x < wend; ++x)
                             {
-                                const T val = src[r * h_src * w_src + y * w_src + x];
+                                const auto val = static_cast<ACC_T>(src[r * h_src * w_src + y * w_src + x]);
                                 avg_val += val * val;
                             }
                         }
-                        dst[r * h_dst * w_dst + h * w_dst + w] = std::sqrt(avg_val / pool);
+                        dst[r * h_dst * w_dst + h * w_dst + w] = static_cast<T>(std::sqrt(avg_val / pool));
                     }
                 }
             }
@@ -152,17 +152,37 @@
     return dst;
 }
 
+template SimpleTensor<float> pooling_layer_internal<float>(const SimpleTensor<float> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
+template SimpleTensor<half> pooling_layer_internal<half>(const SimpleTensor<half> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
+template SimpleTensor<half> pooling_layer_internal<half, float>(const SimpleTensor<half> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
+
+template <typename T>
+SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
+{
+    return pooling_layer_internal<T, T>(src, info, output_qinfo);
+}
+
 template <>
 SimpleTensor<uint8_t> pooling_layer<uint8_t>(const SimpleTensor<uint8_t> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
 {
     SimpleTensor<float>   src_tmp = convert_from_asymmetric(src);
-    SimpleTensor<float>   dst_tmp = pooling_layer<float>(src_tmp, info, output_qinfo);
+    SimpleTensor<float>   dst_tmp = pooling_layer_internal<float>(src_tmp, info, output_qinfo);
     SimpleTensor<uint8_t> dst     = convert_to_asymmetric<uint8_t>(dst_tmp, output_qinfo);
     return dst;
 }
 
+template <>
+SimpleTensor<half> pooling_layer(const SimpleTensor<half> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
+{
+    if(src.data_type() == DataType::F16 && info.fp_mixed_precision())
+    {
+        return pooling_layer_internal<half, float>(src, info, output_qinfo);
+    }
+
+    return pooling_layer_internal<half>(src, info, output_qinfo);
+}
+
 template SimpleTensor<float> pooling_layer(const SimpleTensor<float> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
-template SimpleTensor<half> pooling_layer(const SimpleTensor<half> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/PoolingLayer.h b/tests/validation/reference/PoolingLayer.h
index 1c0b7ff..fc36d51 100644
--- a/tests/validation/reference/PoolingLayer.h
+++ b/tests/validation/reference/PoolingLayer.h
@@ -35,6 +35,8 @@
 {
 namespace reference
 {
+template <typename T, typename ACC_T = T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
 template <typename T>
 SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
 } // namespace reference