COMPMID-1580 Implement ReduceMean in NEON

Change-Id: Id974efad304c2513b8824a6561ad45ee60b9e7fb
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/153763
Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
Reviewed-by: Isabella Gottardi <isabella.gottardi@arm.com>
Tested-by: bsgcomp <bsgcomp@arm.com>
diff --git a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h b/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h
index a20cd46..a4cb330 100644
--- a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h
@@ -53,7 +53,7 @@
 
     /** Set the source and destination of the kernel
      *
-     * @param[in]  input  Source tensor. Data type supported: F32. Data layouts supported: NCHW.
+     * @param[in]  input  Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW.
      * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
      *                    Output will have the same number of dimensions as input.
      * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0
@@ -63,7 +63,7 @@
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperationKernel.
      *
-     * @param[in] input  Source tensor info. Data type supported: F32. Data layouts supported: NCHW.
+     * @param[in] input  Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW.
      * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
      *                   Output will have the same number of dimensions as input.
      * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h b/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
new file mode 100644
index 0000000..47b0116
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_GET_HIGH_H__
+#define __ARM_COMPUTE_WRAPPER_GET_HIGH_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VGETHIGH_IMPL(half_vtype, vtype, postfix) \
+    inline half_vtype vgethigh(const vtype val)   \
+    {                                             \
+        return vget_high_##postfix(val);          \
+    }
+
+VGETHIGH_IMPL(uint8x8_t, uint8x16_t, u8)
+VGETHIGH_IMPL(int8x8_t, int8x16_t, s8)
+VGETHIGH_IMPL(uint16x4_t, uint16x8_t, u16)
+VGETHIGH_IMPL(int16x4_t, int16x8_t, s16)
+VGETHIGH_IMPL(uint32x2_t, uint32x4_t, u32)
+VGETHIGH_IMPL(int32x2_t, int32x4_t, s32)
+VGETHIGH_IMPL(float32x2_t, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETHIGH_IMPL(float16x4_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VGETHIGH_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_GET_HIGH_H__ */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlane.h b/arm_compute/core/NEON/wrapper/intrinsics/getlane.h
new file mode 100644
index 0000000..107ce44
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/getlane.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_GET_LANE_H__
+#define __ARM_COMPUTE_WRAPPER_GET_LANE_H__
+
+#include "arm_compute/core/Error.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VGETLANE_IMPL_8(stype, vtype, postfix)                \
+    inline stype vgetlane(const vtype vector, const int lane) \
+    {                                                         \
+        switch(lane)                                          \
+        {                                                     \
+            case 0:                                           \
+                return vget_lane_##postfix(vector, 0);        \
+            case 1:                                           \
+                return vget_lane_##postfix(vector, 1);        \
+            case 2:                                           \
+                return vget_lane_##postfix(vector, 2);        \
+            case 3:                                           \
+                return vget_lane_##postfix(vector, 3);        \
+            case 4:                                           \
+                return vget_lane_##postfix(vector, 4);        \
+            case 5:                                           \
+                return vget_lane_##postfix(vector, 5);        \
+            case 6:                                           \
+                return vget_lane_##postfix(vector, 6);        \
+            case 7:                                           \
+                return vget_lane_##postfix(vector, 7);        \
+            default:                                          \
+                ARM_COMPUTE_ERROR("Invalid lane");            \
+        }                                                     \
+    }
+
+#define VGETLANE_IMPL_4(stype, vtype, postfix)                \
+    inline stype vgetlane(const vtype vector, const int lane) \
+    {                                                         \
+        switch(lane)                                          \
+        {                                                     \
+            case 0:                                           \
+                return vget_lane_##postfix(vector, 0);        \
+            case 1:                                           \
+                return vget_lane_##postfix(vector, 1);        \
+            case 2:                                           \
+                return vget_lane_##postfix(vector, 2);        \
+            case 3:                                           \
+                return vget_lane_##postfix(vector, 3);        \
+            default:                                          \
+                ARM_COMPUTE_ERROR("Invalid lane");            \
+        }                                                     \
+    }
+
+#define VGETLANE_IMPL_2(stype, vtype, postfix)                \
+    inline stype vgetlane(const vtype vector, const int lane) \
+    {                                                         \
+        switch(lane)                                          \
+        {                                                     \
+            case 0:                                           \
+                return vget_lane_##postfix(vector, 0);        \
+            case 1:                                           \
+                return vget_lane_##postfix(vector, 1);        \
+            default:                                          \
+                ARM_COMPUTE_ERROR("Invalid lane");            \
+        }                                                     \
+    }
+
+VGETLANE_IMPL_8(uint8_t, uint8x8_t, u8)
+VGETLANE_IMPL_8(int8_t, int8x8_t, s8)
+VGETLANE_IMPL_4(uint16_t, uint16x4_t, u16)
+VGETLANE_IMPL_4(int16_t, int16x4_t, s16)
+VGETLANE_IMPL_2(uint32_t, uint32x2_t, u32)
+VGETLANE_IMPL_2(int32_t, int32x2_t, s32)
+VGETLANE_IMPL_2(float, float32x2_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#define VGETQLANE_IMPL_16(stype, vtype, postfix)               \
+    inline stype vgetqlane(const vtype vector, const int lane) \
+    {                                                          \
+        switch(lane)                                           \
+        {                                                      \
+            case 0:                                            \
+                return vgetq_lane_##postfix(vector, 0);        \
+            case 1:                                            \
+                return vgetq_lane_##postfix(vector, 1);        \
+            case 2:                                            \
+                return vgetq_lane_##postfix(vector, 2);        \
+            case 3:                                            \
+                return vgetq_lane_##postfix(vector, 3);        \
+            case 4:                                            \
+                return vgetq_lane_##postfix(vector, 4);        \
+            case 5:                                            \
+                return vgetq_lane_##postfix(vector, 5);        \
+            case 6:                                            \
+                return vgetq_lane_##postfix(vector, 6);        \
+            case 7:                                            \
+                return vgetq_lane_##postfix(vector, 7);        \
+            case 8:                                            \
+                return vgetq_lane_##postfix(vector, 8);        \
+            case 9:                                            \
+                return vgetq_lane_##postfix(vector, 9);        \
+            case 10:                                           \
+                return vgetq_lane_##postfix(vector, 10);       \
+            case 11:                                           \
+                return vgetq_lane_##postfix(vector, 11);       \
+            case 12:                                           \
+                return vgetq_lane_##postfix(vector, 12);       \
+            case 13:                                           \
+                return vgetq_lane_##postfix(vector, 13);       \
+            case 14:                                           \
+                return vgetq_lane_##postfix(vector, 14);       \
+            case 15:                                           \
+                return vgetq_lane_##postfix(vector, 15);       \
+            default:                                           \
+                ARM_COMPUTE_ERROR("Invalid lane");             \
+        }                                                      \
+    }
+
+#define VGETQLANE_IMPL_8(stype, vtype, postfix)                \
+    inline stype vgetqlane(const vtype vector, const int lane) \
+    {                                                          \
+        switch(lane)                                           \
+        {                                                      \
+            case 0:                                            \
+                return vgetq_lane_##postfix(vector, 0);        \
+            case 1:                                            \
+                return vgetq_lane_##postfix(vector, 1);        \
+            case 2:                                            \
+                return vgetq_lane_##postfix(vector, 2);        \
+            case 3:                                            \
+                return vgetq_lane_##postfix(vector, 3);        \
+            case 4:                                            \
+                return vgetq_lane_##postfix(vector, 4);        \
+            case 5:                                            \
+                return vgetq_lane_##postfix(vector, 5);        \
+            case 6:                                            \
+                return vgetq_lane_##postfix(vector, 6);        \
+            case 7:                                            \
+                return vgetq_lane_##postfix(vector, 7);        \
+            default:                                           \
+                ARM_COMPUTE_ERROR("Invalid lane");             \
+        }                                                      \
+    }
+
+#define VGETQLANE_IMPL_4(stype, vtype, postfix)                \
+    inline stype vgetqlane(const vtype vector, const int lane) \
+    {                                                          \
+        switch(lane)                                           \
+        {                                                      \
+            case 0:                                            \
+                return vgetq_lane_##postfix(vector, 0);        \
+            case 1:                                            \
+                return vgetq_lane_##postfix(vector, 1);        \
+            case 2:                                            \
+                return vgetq_lane_##postfix(vector, 2);        \
+            case 3:                                            \
+                return vgetq_lane_##postfix(vector, 3);        \
+            default:                                           \
+                ARM_COMPUTE_ERROR("Invalid lane");             \
+        }                                                      \
+    }
+
+VGETQLANE_IMPL_16(uint8_t, uint8x16_t, u8)
+VGETQLANE_IMPL_16(int8_t, int8x16_t, s8)
+VGETQLANE_IMPL_8(uint16_t, uint16x8_t, u16)
+VGETQLANE_IMPL_8(int16_t, int16x8_t, s16)
+VGETQLANE_IMPL_4(uint32_t, uint32x4_t, u32)
+VGETQLANE_IMPL_4(int32_t, int32x4_t, s32)
+VGETQLANE_IMPL_4(float, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETQLANE_IMPL_8(float16_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VGETLANE_IMPL_8
+#undef VGETLANE_IMPL_4
+#undef VGETLANE_IMPL_2
+#undef VGETQLANE_IMPL_16
+#undef VGETQLANE_IMPL_8
+#undef VGETQLANE_IMPL_4
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_GET_LANE_H__ */
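
Note on the switch-based dispatch above: the lane argument of the underlying vget_lane_*/vgetq_lane_* intrinsics must be a compile-time constant, so the wrapper accepts a runtime index and branches to the matching immediate. A minimal sketch of the intended use (illustration only, not part of the patch):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"

    #include <arm_neon.h>

    // Sum the four lanes of a float32x4_t; the lane index may be a runtime value.
    inline float sum_lanes(float32x4_t v)
    {
        float acc = 0.f;
        for(int lane = 0; lane < 4; ++lane)
        {
            acc += arm_compute::wrapper::vgetqlane(v, lane);
        }
        return acc;
    }
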
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h b/arm_compute/core/NEON/wrapper/intrinsics/getlow.h
new file mode 100644
index 0000000..cc5d8bb
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/getlow.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_GET_LOW_H__
+#define __ARM_COMPUTE_WRAPPER_GET_LOW_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VGETLOW_IMPL(half_vtype, vtype, postfix) \
+    inline half_vtype vgetlow(const vtype val)   \
+    {                                            \
+        return vget_low_##postfix(val);          \
+    }
+
+VGETLOW_IMPL(uint8x8_t, uint8x16_t, u8)
+VGETLOW_IMPL(int8x8_t, int8x16_t, s8)
+VGETLOW_IMPL(uint16x4_t, uint16x8_t, u16)
+VGETLOW_IMPL(int16x4_t, int16x8_t, s16)
+VGETLOW_IMPL(uint32x2_t, uint32x4_t, u32)
+VGETLOW_IMPL(int32x2_t, int32x4_t, s32)
+VGETLOW_IMPL(float32x2_t, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETLOW_IMPL(float16x4_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VGETLOW_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_GET_LOW_H__ */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
index 58bfba9..2e6fd75 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
@@ -28,13 +28,19 @@
 #include "arm_compute/core/NEON/wrapper/intrinsics/and.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/exp.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/getlow.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/inv.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/load.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/max.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/min.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/mla.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/movl.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/movn.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/mul.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/neg.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/padd.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/store.h"
 
 #endif /* __ARM_COMPUTE_WRAPPER_INTRINSICS_H__ */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/load.h b/arm_compute/core/NEON/wrapper/intrinsics/load.h
index 442d857..b5d9ed2 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/load.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/load.h
@@ -45,6 +45,9 @@
 //VLOAD_IMPL(uint64_t, uint64x1_t, u64)
 //VLOAD_IMPL(int64_t, int64x1_t, s64)
 VLOAD_IMPL(float, float32x2_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VLOAD_IMPL(float16_t, float16x4_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 #define VLOADQ_IMPL(stype, vtype, postfix) \
     inline vtype vloadq(const stype *ptr)  \
@@ -61,6 +64,9 @@
 //VLOAD_IMPL(uint64_t, uint64x1_t, u64)
 //VLOAD_IMPL(int64_t, int64x1_t, s64)
 VLOADQ_IMPL(float, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VLOADQ_IMPL(float16_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 #undef VLOAD_IMPL
 } // namespace wrapper
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movl.h b/arm_compute/core/NEON/wrapper/intrinsics/movl.h
new file mode 100644
index 0000000..728fe4e
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/movl.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_MOVL_H__
+#define __ARM_COMPUTE_WRAPPER_MOVL_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMOVL_IMPL(ptype, vtype, prefix, postfix) \
+    inline ptype vmovl(const vtype &a)            \
+    {                                             \
+        return prefix##_##postfix(a);             \
+    }
+
+VMOVL_IMPL(uint16x8_t, uint8x8_t, vmovl, u8)
+VMOVL_IMPL(int16x8_t, int8x8_t, vmovl, s8)
+VMOVL_IMPL(uint32x4_t, uint16x4_t, vmovl, u16)
+VMOVL_IMPL(int32x4_t, int16x4_t, vmovl, s16)
+VMOVL_IMPL(uint64x2_t, uint32x2_t, vmovl, u32)
+VMOVL_IMPL(int64x2_t, int32x2_t, vmovl, s32)
+
+#undef VMOVL_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_MOVL_H__ */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movn.h b/arm_compute/core/NEON/wrapper/intrinsics/movn.h
new file mode 100644
index 0000000..6ed262e
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/movn.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_MOVN_H__
+#define __ARM_COMPUTE_WRAPPER_MOVN_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMOVN_IMPL(dtype, vtype, prefix, postfix) \
+    inline dtype vmovn(const vtype &a)            \
+    {                                             \
+        return prefix##_##postfix(a);             \
+    }
+
+VMOVN_IMPL(uint32x2_t, uint64x2_t, vmovn, u64)
+VMOVN_IMPL(int32x2_t, int64x2_t, vmovn, s64)
+VMOVN_IMPL(uint16x4_t, uint32x4_t, vmovn, u32)
+VMOVN_IMPL(int16x4_t, int32x4_t, vmovn, s32)
+VMOVN_IMPL(uint8x8_t, uint16x8_t, vmovn, u16)
+VMOVN_IMPL(int8x8_t, int16x8_t, vmovn, s16)
+
+#define VQMOVN_IMPL(dtype, vtype, prefix, postfix) \
+    inline dtype vqmovn(const vtype &a)            \
+    {                                              \
+        return prefix##_##postfix(a);              \
+    }
+
+VQMOVN_IMPL(uint32x2_t, uint64x2_t, vqmovn, u64)
+VQMOVN_IMPL(int32x2_t, int64x2_t, vqmovn, s64)
+VQMOVN_IMPL(uint16x4_t, uint32x4_t, vqmovn, u32)
+VQMOVN_IMPL(int16x4_t, int32x4_t, vqmovn, s32)
+VQMOVN_IMPL(uint8x8_t, uint16x8_t, vqmovn, u16)
+VQMOVN_IMPL(int8x8_t, int16x8_t, vqmovn, s16)
+
+#undef VMOVN_IMPL
+#undef VQMOVN_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_MOVN_H__ */
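
For context, vmovl/vmovn (and the saturating vqmovn variants) are what allow the QASYMM8 path to accumulate in a wider type and narrow the result back to 8 bits. A rough sketch of that pattern, assuming plain truncation is acceptable (illustration only, not taken from the kernel itself):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"

    #include <arm_neon.h>

    // Add two groups of 8 uint8 values without intermediate overflow.
    inline uint8x8_t widened_add_u8(uint8x8_t a, uint8x8_t b)
    {
        const uint16x8_t wide_a = arm_compute::wrapper::vmovl(a); // 8 x u8 -> 8 x u16
        const uint16x8_t wide_b = arm_compute::wrapper::vmovl(b);
        const uint16x8_t sum    = vaddq_u16(wide_a, wide_b);      // accumulate in 16 bits
        return arm_compute::wrapper::vmovn(sum);                  // truncating narrow back to u8
    }
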
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mul.h b/arm_compute/core/NEON/wrapper/intrinsics/mul.h
index c1908fc..932b746 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/mul.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/mul.h
@@ -43,6 +43,9 @@
 VMUL_IMPL(uint32x2_t, uint32x2_t, vmul, u32)
 VMUL_IMPL(int32x2_t, int32x2_t, vmul, s32)
 VMUL_IMPL(float32x2_t, float32x2_t, vmul, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMUL_IMPL(float16_t, float16x4_t, vmul, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 VMUL_IMPL(uint8_t, uint8x16_t, vmulq, u8)
 VMUL_IMPL(int8_t, int8x16_t, vmulq, s8)
@@ -51,6 +54,9 @@
 VMUL_IMPL(uint32_t, uint32x4_t, vmulq, u32)
 VMUL_IMPL(int32_t, int32x4_t, vmulq, s32)
 VMUL_IMPL(float32x4_t, float32x4_t, vmulq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMUL_IMPL(float16_t, float16x8_t, vmulq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 #undef VMUL_IMPL
 } // namespace wrapper
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/padd.h b/arm_compute/core/NEON/wrapper/intrinsics/padd.h
new file mode 100644
index 0000000..5ee2173
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/padd.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_PADD_H__
+#define __ARM_COMPUTE_WRAPPER_PADD_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VPADD_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vpadd(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VPADD_IMPL(uint8x8_t, uint8x8_t, vpadd, u8)
+VPADD_IMPL(int8x8_t, int8x8_t, vpadd, s8)
+VPADD_IMPL(uint16x4_t, uint16x4_t, vpadd, u16)
+VPADD_IMPL(int16x4_t, int16x4_t, vpadd, s16)
+VPADD_IMPL(uint32x2_t, uint32x2_t, vpadd, u32)
+VPADD_IMPL(int32x2_t, int32x2_t, vpadd, s32)
+VPADD_IMPL(float32x2_t, float32x2_t, vpadd, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VPADD_IMPL(float16x4_t, float16x4_t, vpadd, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VPADD_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_PADD_H__ */
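
Combined with gethigh/getlow/getlane introduced earlier in this change, vpadd gives the usual horizontal-sum idiom that a reduction along the X axis ends with. A minimal sketch, assuming a float32x4_t accumulator (illustration only):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"

    #include <arm_neon.h>

    inline float horizontal_sum(float32x4_t acc)
    {
        // Pairwise-add the high and low halves, then fold the remaining pair.
        float32x2_t sum = arm_compute::wrapper::vpadd(arm_compute::wrapper::vgethigh(acc),
                                                      arm_compute::wrapper::vgetlow(acc));
        sum             = arm_compute::wrapper::vpadd(sum, sum); // both lanes now hold the total
        return arm_compute::wrapper::vgetlane(sum, 0);
    }
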
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/store.h b/arm_compute/core/NEON/wrapper/intrinsics/store.h
index be89602..35c4279 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/store.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/store.h
@@ -45,6 +45,9 @@
 //VSTORE_IMPL(uint64_t, 1, vst1, u64)
 //VSTORE_IMPL(int64_t, 1, vst1, s64)
 VSTORE_IMPL(float, float32x2_t, vst1, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSTORE_IMPL(float16_t, float16x4_t, vst1, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 VSTORE_IMPL(uint8_t, uint8x16_t, vst1q, u8)
 VSTORE_IMPL(int8_t, int8x16_t, vst1q, s8)
@@ -55,6 +58,9 @@
 //VSTORE_IMPL(uint64_t, 2, vst1q, u64)
 //VSTORE_IMPL(int64_t, 2, vst1q, s64)
 VSTORE_IMPL(float, float32x4_t, vst1q, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSTORE_IMPL(float16_t, float16x8_t, vst1q, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 #undef VSTORE_IMPL
 } // namespace wrapper
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index 2bf8bcd..57bd585 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -101,6 +101,7 @@
 #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NERNNLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
 #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
 #include "arm_compute/runtime/NEON/functions/NERemap.h"
 #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
new file mode 100644
index 0000000..b20ca9c
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_H__
+#define __ARM_COMPUTE_NEON_REDUCE_MEAN_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform reduce operation */
+class NEReduceMean : public IFunction
+{
+public:
+    /** Constructor */
+    NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  input          Source tensor. Data type supported: QASYMM8/F16/F32
+     * @param[in]  reduction_axis Reduction axis vector.
+     * @param[in]  keep_dims      If true, retains reduced dimensions with length 1.
+     * @param[out] output         Destination tensor. Data type supported: Same as @p input
+     */
+    void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEReduceMean
+     *
+     * @param[in] input          Source tensor. Data type supported: QASYMM8/F16/F32
+     * @param[in] reduction_axis Reduction axis vector.
+     * @param[in] keep_dims      If true, retains reduced dimensions with length 1.
+     * @param[in] output         Destination tensor. Data type supported: Same as @p input
+     *
+     * @return A status
+     */
+    static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    MemoryGroup                             _memory_group;
+    std::unique_ptr<NEReductionOperation[]> _reduction_kernels{ nullptr };
+    std::unique_ptr<Tensor[]>               _reduced_outs{ nullptr };
+    NEReshapeLayer                          _reshape;
+    unsigned int                            _reduction_ops;
+    bool                                    _keep_dims;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_H__ */
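
NEReduceMean follows the usual runtime-function flow: configure with the reduction axes and the keep_dims flag, allocate the tensors, then run. A minimal usage sketch with placeholder shapes and data type (illustration only, not part of the patch):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void reduce_mean_example()
    {
        Tensor input;
        Tensor output;
        input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

        NEReduceMean reduce_mean;
        // Average over the first two dimensions, keeping them as length-1 dimensions.
        reduce_mean.configure(&input, Coordinates(0, 1), true, &output);

        input.allocator()->allocate();
        output.allocator()->allocate();
        // ... fill input ...
        reduce_mean.run();
    }
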
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index 02b29fb..5bc7059 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -47,16 +47,16 @@
     NEReductionOperation();
     /** Set the input and output tensors.
      *
-     * @param[in, out] input  Source tensor. Data type supported: F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
-     * @param[out]     output Destination tensor. Data types and data layouts supported: same as @p input.
-     * @param[in]      axis   Dimension along which to reduce. Supported reduction axis : 0
-     * @param[in]      op     Reduction operation to perform.
+     * @param[in]  input  Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
+     * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
+     * @param[in]  axis   Dimension along which to reduce. Supported reduction axis : 0
+     * @param[in]  op     Reduction operation to perform.
      */
     void configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperation.
      *
-     * @param[in] input  Source tensor info. Data type supported: F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
+     * @param[in] input  Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
      * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
      * @param[in] axis   Dimension along which to reduce. Supported reduction axis : 0
      * @param[in] op     Reduction operation to perform.
@@ -72,6 +72,7 @@
     NEReductionOperationKernel _reduction_kernel;
     NEFillBorderKernel         _fill_border_kernel;
     size_t                     _window_split;
+    int                        _reduction_axis;
 };
-}
+} // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEREDUCTIONOPERATION_H__ */
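
For completeness, the underlying NEReductionOperation keeps the same configure/validate shape; a sketch of checking a configuration up front, using the pre-existing SUM_SQUARE operation purely as an illustration:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"

    using namespace arm_compute;

    bool reduction_is_supported()
    {
        const TensorInfo src(TensorShape(32U, 4U), 1, DataType::F32);
        const TensorInfo dst(TensorShape(1U, 4U), 1, DataType::F32);
        // Axis 0 is the only reduction axis supported, per the documentation above.
        const Status status = NEReductionOperation::validate(&src, &dst, 0, ReductionOperation::SUM_SQUARE);
        return static_cast<bool>(status);
    }
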