COMPMID-1188: Add support for activation in NEBatchNormalization.

Change-Id: I1e206574dac6433218db6e138adb7bf5f66a536d
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/145222
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h b/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h
index 1c07b4f..4d9a795 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h
@@ -45,6 +45,9 @@
 VDUP_N_IMPL(uint32_t, uint32x2_t, vdup_n, u32, traits::vector_64_tag)
 VDUP_N_IMPL(int32_t, int32x2_t, vdup_n, s32, traits::vector_64_tag)
 VDUP_N_IMPL(float, float32x2_t, vdup_n, f32, traits::vector_64_tag)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDUP_N_IMPL(float16_t, float16x4_t, vdup_n, f16, traits::vector_64_tag)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 VDUP_N_IMPL(uint8_t, uint8x16_t, vdupq_n, u8, traits::vector_128_tag)
 VDUP_N_IMPL(int8_t, int8x16_t, vdupq_n, s8, traits::vector_128_tag)
@@ -53,6 +56,9 @@
 VDUP_N_IMPL(uint32_t, uint32x4_t, vdupq_n, u32, traits::vector_128_tag)
 VDUP_N_IMPL(int32_t, int32x4_t, vdupq_n, s32, traits::vector_128_tag)
 VDUP_N_IMPL(float, float32x4_t, vdupq_n, f32, traits::vector_128_tag)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDUP_N_IMPL(float16_t, float16x8_t, vdupq_n, f16, traits::vector_128_tag)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 #undef VDUP_N_IMPL
 } // namespace wrapper
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/max.h b/arm_compute/core/NEON/wrapper/intrinsics/max.h
index 1a8e95d..05ed051 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/max.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/max.h
@@ -43,6 +43,9 @@
 VMAX_IMPL(uint32_t, uint32x2_t, vmax, u32)
 VMAX_IMPL(int32_t, int32x2_t, vmax, s32)
 VMAX_IMPL(float, float32x2_t, vmax, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAX_IMPL(float16_t, float16x4_t, vmax, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 VMAX_IMPL(uint8_t, uint8x16_t, vmaxq, u8)
 VMAX_IMPL(int8_t, int8x16_t, vmaxq, s8)
@@ -51,6 +54,9 @@
 VMAX_IMPL(uint32_t, uint32x4_t, vmaxq, u32)
 VMAX_IMPL(int32_t, int32x4_t, vmaxq, s32)
 VMAX_IMPL(float, float32x4_t, vmaxq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 #undef VMAX_IMPL
 } // namespace wrapper
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/min.h b/arm_compute/core/NEON/wrapper/intrinsics/min.h
index ae79631..5ea2068 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/min.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/min.h
@@ -43,6 +43,9 @@
 VMIN_IMPL(uint32_t, uint32x2_t, vmin, u32)
 VMIN_IMPL(int32_t, int32x2_t, vmin, s32)
 VMIN_IMPL(float, float32x2_t, vmin, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMIN_IMPL(float16_t, float16x4_t, vmin, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 VMIN_IMPL(uint8_t, uint8x16_t, vminq, u8)
 VMIN_IMPL(int8_t, int8x16_t, vminq, s8)
@@ -51,6 +54,9 @@
 VMIN_IMPL(uint32_t, uint32x4_t, vminq, u32)
 VMIN_IMPL(int32_t, int32x4_t, vminq, s32)
 VMIN_IMPL(float, float32x4_t, vminq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMIN_IMPL(float16_t, float16x8_t, vminq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 #undef VMIN_IMPL
 } // namespace wrapper