Move utility functions to NE/SVEMath

To avoid unused-function warnings when only a subset of
data types is selected, the definitions of the conversion
helpers are moved from the softmax kernel headers to NE/SVEMath.

Partially Resolves: COMPMID-4282

Change-Id: Ic30ddd3f2c88cac5978d27e5f4ada3639b5a04e5
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5215
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/NEMath.h b/src/core/NEON/NEMath.h
index 877ffb2..13484c9 100644
--- a/src/core/NEON/NEMath.h
+++ b/src/core/NEON/NEMath.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -205,6 +205,24 @@
  */
 void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out);
 
+/** Converts from float vector to integer vector
+ *
+ * @param[in] in Float vector to be converted
+ *
+ * @return The converted integer vector
+ */
+template <typename float_vec_type, typename int_vec_type>
+int_vec_type convert_float_to_int(const float_vec_type &in);
+
+/** Converts from integer vector to float vector
+ *
+ * @param[in] in Integer vector to be converted
+ *
+ * @return The converted float vector
+ */
+template <typename float_vec_type, typename int_vec_type>
+float_vec_type convert_int_to_float(const int_vec_type &in);
+
 /** Calculate sine.
  *
  * @param[in] val Input vector value in radians, F32 format.
diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl
index 1f5cb56..da9d038 100644
--- a/src/core/NEON/NEMath.inl
+++ b/src/core/NEON/NEMath.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -387,6 +387,34 @@
     out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
 }
 
+template <>
+inline uint8x16_t convert_float_to_int<float32x4x4_t, uint8x16_t>(const float32x4x4_t &in)
+{
+    uint8x16_t out;
+    convert_float32x4x4_to_uint8x16(in, out);
+    return out;
+}
+
+template <>
+inline float32x4x4_t convert_int_to_float<float32x4x4_t, uint8x16_t>(const uint8x16_t &in)
+{
+    return convert_uint8x16_to_float32x4x4(in);
+}
+
+template <>
+inline int8x16_t convert_float_to_int<float32x4x4_t, int8x16_t>(const float32x4x4_t &in)
+{
+    int8x16_t out;
+    convert_float32x4x4_to_int8x16(in, out);
+    return out;
+}
+
+template <>
+inline float32x4x4_t convert_int_to_float<float32x4x4_t, int8x16_t>(const int8x16_t &in)
+{
+    return convert_int8x16_to_float32x4x4(in);
+}
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Exponent polynomial coefficients */
 /** Logarithm polynomial coefficients */
diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h
index 075cb45..b73043a 100644
--- a/src/core/NEON/SVEMath.h
+++ b/src/core/NEON/SVEMath.h
@@ -171,6 +171,18 @@
  */
 svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b);
 
+/** Convert and pack four 32-bit float vectors into an 8-bit integer vector
+ *
+ * @param[in] in_0 The first float vector
+ * @param[in] in_1 The second float vector
+ * @param[in] in_2 The third float vector
+ * @param[in] in_3 The fourth float vector
+ *
+ * @return The converted integer vector
+ */
+template <typename int_vec_type>
+int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3);
+
 } // namespace arm_compute
 #include "src/core/NEON/SVEMath.inl"
 #endif /* defined(__ARM_FEATURE_SVE) */
diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl
index cf7f9f5..a851b8a 100644
--- a/src/core/NEON/SVEMath.inl
+++ b/src/core/NEON/SVEMath.inl
@@ -325,5 +325,77 @@
 #endif /* defined(__ARM_FEATURE_SVE2) */
 }
 
+#if defined(__ARM_FEATURE_SVE2)
+/* svqxtnb/svqxtnt are SVE2-only intrinsics, so these specializations must not be
+ * compiled when only plain SVE is available. */
+template <>
+inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
+{
+    svuint8_t  out;
+    const auto all_true_pg = svptrue_b32();
+    auto       tmp_0       = svcvt_u32_f32_z(all_true_pg, in_0);
+    auto       tmp_1       = svcvt_u32_f32_z(all_true_pg, in_1);
+    auto       tmp_2       = svcvt_u32_f32_z(all_true_pg, in_2);
+    auto       tmp_3       = svcvt_u32_f32_z(all_true_pg, in_3);
+
+    auto tmp_16_0 = svqxtnt_u32(svqxtnb_u32(tmp_0), tmp_1);
+    auto tmp_16_1 = svqxtnt_u32(svqxtnb_u32(tmp_2), tmp_3);
+
+    auto tmp_16_uzp_0 = svuzp1(tmp_16_0, tmp_16_0);
+    auto tmp_16_uzp_1 = svuzp2(tmp_16_0, tmp_16_0);
+    auto tmp_16_uzp_2 = svuzp1(tmp_16_1, tmp_16_1);
+    auto tmp_16_uzp_3 = svuzp2(tmp_16_1, tmp_16_1);
+
+    auto pg = svwhilelt_b16_s32(0, svcnth() / 2);
+
+    tmp_16_0 = svsplice(pg, tmp_16_uzp_0, tmp_16_uzp_1);
+    tmp_16_1 = svsplice(pg, tmp_16_uzp_2, tmp_16_uzp_3);
+
+    out = svqxtnt_u16(svqxtnb_u16(tmp_16_0), tmp_16_1);
+
+    auto out_uzp_0 = svuzp1(out, out);
+    auto out_uzp_1 = svuzp2(out, out);
+
+    pg  = svwhilelt_b8_s32(0, svcntb() / 2);
+    out = svsplice(pg, out_uzp_0, out_uzp_1);
+
+    return out;
+}
+
+template <>
+inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
+{
+    svint8_t   out;
+    const auto all_true_pg = svptrue_b32();
+    auto       tmp_0       = svcvt_s32_f32_z(all_true_pg, in_0);
+    auto       tmp_1       = svcvt_s32_f32_z(all_true_pg, in_1);
+    auto       tmp_2       = svcvt_s32_f32_z(all_true_pg, in_2);
+    auto       tmp_3       = svcvt_s32_f32_z(all_true_pg, in_3);
+
+    auto tmp_16_0 = svqxtnt_s32(svqxtnb_s32(tmp_0), tmp_1);
+    auto tmp_16_1 = svqxtnt_s32(svqxtnb_s32(tmp_2), tmp_3);
+
+    auto tmp_16_uzp_0 = svuzp1(tmp_16_0, tmp_16_0);
+    auto tmp_16_uzp_1 = svuzp2(tmp_16_0, tmp_16_0);
+    auto tmp_16_uzp_2 = svuzp1(tmp_16_1, tmp_16_1);
+    auto tmp_16_uzp_3 = svuzp2(tmp_16_1, tmp_16_1);
+
+    auto pg = svwhilelt_b16_s32(0, svcnth() / 2);
+
+    tmp_16_0 = svsplice(pg, tmp_16_uzp_0, tmp_16_uzp_1);
+    tmp_16_1 = svsplice(pg, tmp_16_uzp_2, tmp_16_uzp_3);
+
+    out = svqxtnt_s16(svqxtnb_s16(tmp_16_0), tmp_16_1);
+
+    auto out_uzp_0 = svuzp1(out, out);
+    auto out_uzp_1 = svuzp2(out, out);
+
+    pg  = svwhilelt_b8_s32(0, svcntb() / 2);
+    out = svsplice(pg, out_uzp_0, out_uzp_1);
+
+    return out;
+}
+#endif /* defined(__ARM_FEATURE_SVE2) */
+
 } // namespace arm_compute
 #endif /* defined(__ARM_FEATURE_SVE) */
diff --git a/src/core/cpu/kernels/softmax/impl/NEON/list.h b/src/core/cpu/kernels/softmax/impl/NEON/list.h
index 3f9438e..740e6ea 100644
--- a/src/core/cpu/kernels/softmax/impl/NEON/list.h
+++ b/src/core/cpu/kernels/softmax/impl/NEON/list.h
@@ -33,43 +33,6 @@
 {
 namespace cpu
 {
-namespace
-{
-template <typename float_vec_type, typename int_vec_type>
-int_vec_type convert_float_to_int(const float_vec_type &in);
-
-template <typename float_vec_type, typename int_vec_type>
-float_vec_type convert_int_to_float(const int_vec_type &in);
-
-template <>
-uint8x16_t convert_float_to_int<float32x4x4_t, uint8x16_t>(const float32x4x4_t &in)
-{
-    uint8x16_t out;
-    convert_float32x4x4_to_uint8x16(in, out);
-    return out;
-}
-
-template <>
-int8x16_t convert_float_to_int<float32x4x4_t, int8x16_t>(const float32x4x4_t &in)
-{
-    int8x16_t out;
-    convert_float32x4x4_to_int8x16(in, out);
-    return out;
-}
-
-template <>
-float32x4x4_t convert_int_to_float<float32x4x4_t, uint8x16_t>(const uint8x16_t &in)
-{
-    return convert_uint8x16_to_float32x4x4(in);
-}
-
-template <>
-float32x4x4_t convert_int_to_float<float32x4x4_t, int8x16_t>(const int8x16_t &in)
-{
-    return convert_int8x16_to_float32x4x4(in);
-}
-} // namespace
-
 template <typename T>
 void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
 {
diff --git a/src/core/cpu/kernels/softmax/impl/SVE/list.h b/src/core/cpu/kernels/softmax/impl/SVE/list.h
index 0936bd5..d558d7d 100644
--- a/src/core/cpu/kernels/softmax/impl/SVE/list.h
+++ b/src/core/cpu/kernels/softmax/impl/SVE/list.h
@@ -35,82 +35,6 @@
 {
 namespace cpu
 {
-namespace
-{
-#if defined(__ARM_FEATURE_SVE2)
-template <typename int_vec_type>
-int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3);
-
-template <>
-svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
-{
-    svuint8_t  out;
-    const auto all_true_pg = svptrue_b32();
-    auto       tmp_0       = svcvt_u32_f32_z(all_true_pg, in_0);
-    auto       tmp_1       = svcvt_u32_f32_z(all_true_pg, in_1);
-    auto       tmp_2       = svcvt_u32_f32_z(all_true_pg, in_2);
-    auto       tmp_3       = svcvt_u32_f32_z(all_true_pg, in_3);
-
-    auto tmp_16_0 = svqxtnt_u32(svqxtnb_u32(tmp_0), tmp_1);
-    auto tmp_16_1 = svqxtnt_u32(svqxtnb_u32(tmp_2), tmp_3);
-
-    auto tmp_16_uzp_0 = svuzp1(tmp_16_0, tmp_16_0);
-    auto tmp_16_uzp_1 = svuzp2(tmp_16_0, tmp_16_0);
-    auto tmp_16_uzp_2 = svuzp1(tmp_16_1, tmp_16_1);
-    auto tmp_16_uzp_3 = svuzp2(tmp_16_1, tmp_16_1);
-
-    auto pg = svwhilelt_b16_s32(0, svcnth() / 2);
-
-    tmp_16_0 = svsplice(pg, tmp_16_uzp_0, tmp_16_uzp_1);
-    tmp_16_1 = svsplice(pg, tmp_16_uzp_2, tmp_16_uzp_3);
-
-    out = svqxtnt_u16(svqxtnb_u16(tmp_16_0), tmp_16_1);
-
-    auto out_uzp_0 = svuzp1(out, out);
-    auto out_uzp_1 = svuzp2(out, out);
-
-    pg  = svwhilelt_b8_s32(0, svcntb() / 2);
-    out = svsplice(pg, out_uzp_0, out_uzp_1);
-
-    return out;
-}
-
-template <>
-svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
-{
-    svint8_t   out;
-    const auto all_true_pg = svptrue_b32();
-    auto       tmp_0       = svcvt_s32_f32_z(all_true_pg, in_0);
-    auto       tmp_1       = svcvt_s32_f32_z(all_true_pg, in_1);
-    auto       tmp_2       = svcvt_s32_f32_z(all_true_pg, in_2);
-    auto       tmp_3       = svcvt_s32_f32_z(all_true_pg, in_3);
-
-    auto tmp_16_0 = svqxtnt_s32(svqxtnb_s32(tmp_0), tmp_1);
-    auto tmp_16_1 = svqxtnt_s32(svqxtnb_s32(tmp_2), tmp_3);
-
-    auto tmp_16_uzp_0 = svuzp1(tmp_16_0, tmp_16_0);
-    auto tmp_16_uzp_1 = svuzp2(tmp_16_0, tmp_16_0);
-    auto tmp_16_uzp_2 = svuzp1(tmp_16_1, tmp_16_1);
-    auto tmp_16_uzp_3 = svuzp2(tmp_16_1, tmp_16_1);
-
-    auto pg = svwhilelt_b16_s32(0, svcnth() / 2);
-
-    tmp_16_0 = svsplice(pg, tmp_16_uzp_0, tmp_16_uzp_1);
-    tmp_16_1 = svsplice(pg, tmp_16_uzp_2, tmp_16_uzp_3);
-
-    out = svqxtnt_s16(svqxtnb_s16(tmp_16_0), tmp_16_1);
-
-    auto out_uzp_0 = svuzp1(out, out);
-    auto out_uzp_1 = svuzp2(out, out);
-
-    pg  = svwhilelt_b8_s32(0, svcntb() / 2);
-    out = svsplice(pg, out_uzp_0, out_uzp_1);
-
-    return out;
-}
-#endif /* defined(__ARM_FEATURE_SVE2) */
-} // namespace
-
 template <typename ScalarType>
 void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
 {