COMPMID-3637: Move wrapper to src

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I524b0c4b49c7a7035b7d078b9585d77b0d438e10
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4083
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h
new file mode 100644
index 0000000..70d48d5
--- /dev/null
+++ b/src/core/NEON/NEAsymm.h
@@ -0,0 +1,753 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEASYMM_H
+#define ARM_COMPUTE_NEASYMM_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+using qasymm8x8_t   = uint8x8_t;   /**< 8 bit quantized asymmetric vector with 8 elements */
+using qasymm8x8x2_t = uint8x8x2_t; /**< 8 bit quantized asymmetric vector with 16 elements */
+using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 24 elements */
+using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */
+using qasymm8x16_t  = uint8x16_t;  /**< 8 bit quantized asymmetric vector with 16 elements */
+
+using qasymm8x8_signed_t   = int8x8_t;   /**< 8 bit quantized signed asymmetric vector with 8 elements */
+using qasymm8x8x2_signed_t = int8x8x2_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */
+using qasymm8x8x3_signed_t = int8x8x3_t; /**< 8 bit quantized signed asymmetric vector with 24 elements */
+using qasymm8x8x4_signed_t = int8x8x4_t; /**< 8 bit quantized signed asymmetric vector with 32 elements */
+using qasymm8x16_signed_t  = int8x16_t;  /**< 8 bit quantized signed asymmetric vector with 16 elements */
+
+/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector
+ *
+ * vd*vs + vo
+ *
+ * @param[in] vd Input vector value in QASYMM8 format
+ * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
+ * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
+ *
+ * @return A 16-component vector in QASYMM8 format, saturated to fit
+ */
+uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
+
+/** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector
+ *
+ * vd*vs + vo
+ *
+ * @param[in] vd Input vector value in QASYMM8_SIGNED format
+ * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
+ * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
+ *
+ * @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit
+ */
+int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
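+
+// Usage sketch (illustrative; "scale", "offset" and "src" are assumed names):
+// rescale 16 QASYMM8 values in one shot, duplicating the scalar multiplier and
+// addend across all four lanes as the functions above require:
+//
+//   const float32x4_t vs  = vdupq_n_f32(scale);
+//   const float32x4_t vo  = vdupq_n_f32(offset);
+//   const uint8x16_t  out = vmlaq_qasymm8(vld1q_u8(src), vs, vo);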
+
+/** Performs final quantization step on 16 elements
+ *
+ * @param[in] in_s32                        Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_u8                        Relu lower bound
+ * @param[in] max_u8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specifies whether a fused bounded relu should be applied
+ *
+ * @return Quantized values
+ */
+inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
+                                        int          result_fixedpoint_multiplier,
+                                        int32_t      result_shift,
+                                        int32x4_t    result_offset_after_shift_s32,
+                                        uint8x16_t   min_u8,
+                                        uint8x16_t   max_u8,
+                                        bool         is_bounded_relu)
+{
+    const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+    if(result_shift < 0)
+    {
+        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
+        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
+        in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
+        in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));
+
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
+        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
+        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
+
+        // Round to the nearest division by a power-of-two using result_shift
+        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
+        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
+        in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
+        in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
+    }
+
+    // Add the offset terms
+    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
+    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
+    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
+    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
+
+    // Saturate negative values
+    in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+    in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+    in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+    in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+    // Convert S32 to S16
+    const int16x8x2_t in_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+        }
+    };
+
+    // Convert S16 to U8
+    uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_u8 = vmaxq_u8(out_u8, min_u8);
+        out_u8 = vminq_u8(out_u8, max_u8);
+    }
+
+    return out_u8;
+}
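+
+// Usage sketch (illustrative; acc_ptr/dst_ptr and the result_* scalars are
+// assumed names): requantize 16 S32 accumulators, e.g. a GEMM output tile, to
+// QASYMM8 with no fused ReLU:
+//
+//   int32x4x4_t acc = { { vld1q_s32(acc_ptr),     vld1q_s32(acc_ptr + 4),
+//                         vld1q_s32(acc_ptr + 8), vld1q_s32(acc_ptr + 12) } };
+//   vst1q_u8(dst_ptr, finalize_quantization(acc, result_mult, result_shift,
+//                                           vdupq_n_s32(result_offset),
+//                                           vdupq_n_u8(0), vdupq_n_u8(255), false));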
+
+/** Performs final quantization step on 16 elements
+ *
+ * @param[in] in_s32                        Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_s8                        Relu lower bound
+ * @param[in] max_s8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specifies whether a fused bounded relu should be applied
+ *
+ * @return Quantized values
+ */
+inline int8x16_t finalize_quantization(int32x4x4_t &in_s32,
+                                       int          result_fixedpoint_multiplier,
+                                       int32_t      result_shift,
+                                       int32x4_t    result_offset_after_shift_s32,
+                                       int8x16_t    min_s8,
+                                       int8x16_t    max_s8,
+                                       bool         is_bounded_relu)
+{
+    if(result_shift < 0)
+    {
+        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
+        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
+        in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
+        in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));
+
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
+        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
+        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
+
+        // Round to the nearest division by a power-of-two using result_shift
+        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
+        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
+        in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
+        in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
+    }
+
+    // Add the offset terms
+    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
+    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
+    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
+    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
+
+    // Convert S32 to S16
+    const int16x8x2_t in_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+        }
+    };
+
+    // Convert S16 to S8
+    int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_s8 = vmaxq_s8(out_s8, min_s8);
+        out_s8 = vminq_s8(out_s8, max_s8);
+    }
+
+    return out_s8;
+}
+
+/** Performs final quantization step on 16 elements for symmetric quantization
+ *
+ * @param[in] in_s32                        Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_s8                        Relu lower bound
+ * @param[in] max_s8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specifies whether a fused bounded relu should be applied
+ *
+ * @return Quantized values
+ */
+inline int8x16_t finalize_quantization_symm(int32x4x4_t       &in_s32,
+                                            const int32x4x4_t &result_fixedpoint_multiplier,
+                                            const int32x4x4_t &result_shift,
+                                            const int32x4_t   &result_offset_after_shift_s32,
+                                            const int8x16_t   &min_s8,
+                                            const int8x16_t   &max_s8,
+                                            const bool         is_bounded_relu)
+{
+    const static int32x4_t one_s32 = vdupq_n_s32(1);
+
+    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+    int32x4x4_t res_shift_gt0 =
+    {
+        vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]),
+        vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]),
+        vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]),
+        vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]),
+    };
+    // Round to the nearest division by a power-of-two using result_shift
+    res_shift_gt0.val[0] = rounding_divide_by_pow2(res_shift_gt0.val[0], result_shift.val[0]);
+    res_shift_gt0.val[1] = rounding_divide_by_pow2(res_shift_gt0.val[1], result_shift.val[1]);
+    res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]);
+    res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]);
+
+    int32x4x4_t res_shift_lt0 =
+    {
+        vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))),
+        vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))),
+        vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))),
+        vmulq_s32(in_s32.val[3], vshlq_s32(one_s32, vnegq_s32(result_shift.val[3]))),
+    };
+    res_shift_lt0.val[0] = vqrdmulhq_s32(res_shift_lt0.val[0], result_fixedpoint_multiplier.val[0]);
+    res_shift_lt0.val[1] = vqrdmulhq_s32(res_shift_lt0.val[1], result_fixedpoint_multiplier.val[1]);
+    res_shift_lt0.val[2] = vqrdmulhq_s32(res_shift_lt0.val[2], result_fixedpoint_multiplier.val[2]);
+    res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]);
+
+    // Select result depending on shift value
+    const uint32x4x4_t mask_lt0 =
+    {
+#ifdef __aarch64__
+        vcltzq_s32(result_shift.val[0]),
+        vcltzq_s32(result_shift.val[1]),
+        vcltzq_s32(result_shift.val[2]),
+        vcltzq_s32(result_shift.val[3]),
+#else  //__aarch64__
+        vcltq_s32(result_shift.val[0], vdupq_n_s32(0)),
+        vcltq_s32(result_shift.val[1], vdupq_n_s32(0)),
+        vcltq_s32(result_shift.val[2], vdupq_n_s32(0)),
+        vcltq_s32(result_shift.val[3], vdupq_n_s32(0)),
+#endif //__aarch64__
+    };
+
+    in_s32.val[0] = vbslq_s32(mask_lt0.val[0], res_shift_lt0.val[0], res_shift_gt0.val[0]);
+    in_s32.val[1] = vbslq_s32(mask_lt0.val[1], res_shift_lt0.val[1], res_shift_gt0.val[1]);
+    in_s32.val[2] = vbslq_s32(mask_lt0.val[2], res_shift_lt0.val[2], res_shift_gt0.val[2]);
+    in_s32.val[3] = vbslq_s32(mask_lt0.val[3], res_shift_lt0.val[3], res_shift_gt0.val[3]);
+
+    // Add the offset terms
+    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
+    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
+    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
+    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
+
+    // Convert S32 to S16
+    const int16x8x2_t in_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+        }
+    };
+
+    // Convert S16 to S8
+    int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_s8 = vmaxq_s8(out_s8, min_s8);
+        out_s8 = vminq_s8(out_s8, max_s8);
+    }
+
+    return out_s8;
+}
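+
+// Usage sketch (illustrative; acc, mult and shift are assumed names): the symm
+// variant lets every lane carry its own multiplier/shift, as needed for
+// per-channel quantization, e.g. with values loaded from per-channel arrays:
+//
+//   const int32x4x4_t vmult  = { { vld1q_s32(mult),      vld1q_s32(mult + 4),
+//                                  vld1q_s32(mult + 8),  vld1q_s32(mult + 12) } };
+//   const int32x4x4_t vshift = { { vld1q_s32(shift),     vld1q_s32(shift + 4),
+//                                  vld1q_s32(shift + 8), vld1q_s32(shift + 12) } };
+//   const int8x16_t out = finalize_quantization_symm(acc, vmult, vshift,
+//                                                    vdupq_n_s32(0), vdupq_n_s8(-128),
+//                                                    vdupq_n_s8(127), false);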
+
+/** Performs final quantization step on a single element
+ *
+ * @param[in] in_value                      Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_u8                        Relu lower bound
+ * @param[in] max_u8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specifies whether a fused bounded relu should be applied
+ *
+ * @return Quantized value
+ */
+inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
+                                     int32_t result_shift, int32_t result_offset_after_shift_s32,
+                                     uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
+{
+    int32x4_t in_s32 = vdupq_n_s32(in_value);
+
+    if(result_shift < 0)
+    {
+        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
+        // Shift value by result_shift
+        in_value = rounding_divide_by_pow2(in_value, result_shift);
+    }
+
+    // Add the offset term
+    in_value += result_offset_after_shift_s32;
+
+    // Bound the result
+    uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
+    if(is_bounded_relu)
+    {
+        out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8)));
+    }
+
+    return out_u8;
+}
+
+/** Performs final quantization step on a single element
+ *
+ * @param[in] in_value                      Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_s8                        Relu lower bound
+ * @param[in] max_s8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specifies whether a fused bounded relu should be applied
+ *
+ * @return Quantized value
+ */
+inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
+                                    int32_t result_shift, int32_t result_offset_after_shift_s32,
+                                    int8_t min_s8, int8_t max_s8, bool is_bounded_relu)
+{
+    int32x4_t in_s32 = vdupq_n_s32(in_value);
+
+    if(result_shift < 0)
+    {
+        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
+
+        // Shift value by result_shift
+        in_value = rounding_divide_by_pow2(in_value, result_shift);
+    }
+
+    // Add the offset term
+    in_value += result_offset_after_shift_s32;
+
+    // Bound the result
+    int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
+    if(is_bounded_relu)
+    {
+        out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
+    }
+
+    return out_s8;
+}
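+
+// Usage sketch (illustrative; x, window_end_x, in_ptr, out_ptr and the
+// result_* scalars are assumed loop state): requantize the leftover elements
+// of a row after the vectorized loop, one value at a time:
+//
+//   for(; x < window_end_x; ++x)
+//   {
+//       out_ptr[x] = finalize_quantization(in_ptr[x], result_mult, result_shift,
+//                                          result_offset, uint8_t(0), uint8_t(255),
+//                                          false);
+//   }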
+
+/** Dequantize a neon vector holding 8 quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float         scale   = qi.scale;
+    const int           offset  = qi.offset;
+    const int32x4_t     voffset = vdupq_n_s32(offset);
+    const float32x4_t   vscale  = vdupq_n_f32(scale);
+    const float32x4x2_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
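+
+// Usage sketch (illustrative; src_ptr/dst_ptr are assumed pointers and qi the
+// tensor's UniformQuantizationInfo): dequantize 8 QASYMM8 values into two
+// float32x4 registers and store them:
+//
+//   const float32x4x2_t f = vdequantize(vld1_u8(src_ptr), qi);
+//   vst1q_f32(dst_ptr, f.val[0]);
+//   vst1q_f32(dst_ptr + 4, f.val[1]);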
+
+/** Dequantize a neon vector holding 8 signed quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float         scale   = qi.scale;
+    const int           offset  = qi.offset;
+    const int32x4_t     voffset = vdupq_n_s32(offset);
+    const float32x4_t   vscale  = vdupq_n_f32(scale);
+    const float32x4x2_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Dequantize a neon vector holding 16 quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float         scale   = qi.scale;
+    const int           offset  = qi.offset;
+    const int32x4_t     voffset = vdupq_n_s32(offset);
+    const float32x4_t   vscale  = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Dequantize a neon vector holding 16 signed quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float         scale   = qi.scale;
+    const int           offset  = qi.offset;
+    const int32x4_t     voffset = vdupq_n_s32(offset);
+    const float32x4_t   vscale  = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Dequantize a neon vector holding 16 quantized values, following an asymmetric quantization scheme.
+ *
+ * @param[in] qv     Input values to be dequantized.
+ * @param[in] scale  Quantization scaling factor.
+ * @param[in] offset Zero quantization offset.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
+{
+    const int32x4_t     voffset = vdupq_n_s32(offset);
+    const float32x4_t   vscale  = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Dequantize a neon vector holding 16 signed asymmetric quantized values.
+ *
+ * @param[in] qv     Input values to be dequantized.
+ * @param[in] scale  Quantization scaling factor.
+ * @param[in] offset Zero quantization offset.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset)
+{
+    const int32x4_t     voffset = vdupq_n_s32(offset);
+    const float32x4_t   vscale  = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Dequantize a neon vector holding 16 quantized values, following a symmetric quantization scheme with per-lane scales.
+ *
+ * @param[in] qv     Input values to be dequantized.
+ * @param[in] vscale Vector containing quantization scaling factors.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale)
+{
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Dequantize a neon vector holding 16 quantized values, following a symmetric quantization scheme.
+ *
+ * @param[in] qv    Input values to be dequantized.
+ * @param[in] scale Quantization scaling factor.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
+{
+    const float32x4_t   vscale = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Quantize a neon vector holding 8 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float       scale     = qi.scale;
+    const int         offset    = qi.offset;
+    const float32x4_t voffset   = vdupq_n_f32(offset);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+#else  //__aarch64__
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+}
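+
+// Usage sketch (illustrative; src_ptr/dst_ptr are assumed pointers): quantize
+// 8 floats back to QASYMM8:
+//
+//   const float32x4x2_t f = { { vld1q_f32(src_ptr), vld1q_f32(src_ptr + 4) } };
+//   vst1_u8(dst_ptr, vquantize(f, qi));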
+
+/** Quantize a neon vector holding 8 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the signed quantized values
+ */
+inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float       scale     = qi.scale;
+    const int         offset    = qi.offset;
+    const float32x4_t voffset   = vdupq_n_f32(offset);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+#else  //__aarch64__
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+}
+
+/** Quantize a neon vector holding 16 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float       scale     = qi.scale;
+    const int         offset    = qi.offset;
+    const float32x4_t voffset   = vdupq_n_f32(offset);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#else  //__aarch64__
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+    const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+    return vcombine_u8(pa, pb);
+}
+
+/** Quantize a neon vector holding 16 floating point values to a signed 8-bit quantized format.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float       scale     = qi.scale;
+    const int         offset    = qi.offset;
+    const float32x4_t voffset   = vdupq_n_f32(offset);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#else  //__aarch64__
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+    const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+    return vcombine_s8(pa, pb);
+}
+
+/** Quantize to QASYMM16 a neon vector holding 16 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float       scale     = qi.scale;
+    const int         offset    = qi.offset;
+    const float32x4_t voffset   = vdupq_n_f32(offset);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#else  //__aarch64__
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
+    const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
+    return { pa, pb };
+}
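+
+// Usage sketch (illustrative; src_ptr/dst_ptr are assumed pointers): quantize
+// 16 floats to QASYMM16 and store both 8-element halves:
+//
+//   const float32x4x4_t f = { { vld1q_f32(src_ptr),     vld1q_f32(src_ptr + 4),
+//                               vld1q_f32(src_ptr + 8), vld1q_f32(src_ptr + 12) } };
+//   const uint16x8x2_t  q = vquantize_qasymm16(f, qi);
+//   vst1q_u16(dst_ptr, q.val[0]);
+//   vst1q_u16(dst_ptr + 8, q.val[1]);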
+} // namespace arm_compute
+#include "src/core/NEON/NEAsymm.inl"
+#endif // ARM_COMPUTE_NEASYMM_H
diff --git a/src/core/NEON/NEAsymm.inl b/src/core/NEON/NEAsymm.inl
new file mode 100644
index 0000000..6ee1a33
--- /dev/null
+++ b/src/core/NEON/NEAsymm.inl
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+namespace arm_compute
+{
+inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo)
+{
+    // Convert uint8 vectors to uint16 vectors
+    const uint8x8_t vd_low        = vget_low_u8(vd);
+    const uint8x8_t vd_high       = vget_high_u8(vd);
+    uint16x8_t      vd_low_u16x8  = vmovl_u8(vd_low);
+    uint16x8_t      vd_high_u16x8 = vmovl_u8(vd_high);
+    // Convert uint16 vectors to uint32 vectors
+    uint32x4_t A_u32x4 = vmovl_u16(vget_low_u16(vd_low_u16x8));
+    uint32x4_t B_u32x4 = vmovl_u16(vget_high_u16(vd_low_u16x8));
+    uint32x4_t C_u32x4 = vmovl_u16(vget_low_u16(vd_high_u16x8));
+    uint32x4_t D_u32x4 = vmovl_u16(vget_high_u16(vd_high_u16x8));
+    // Convert uint32 vectors to float32 vectors
+    float32x4_t A_f32x4 = vcvtq_f32_u32(A_u32x4);
+    float32x4_t B_f32x4 = vcvtq_f32_u32(B_u32x4);
+    float32x4_t C_f32x4 = vcvtq_f32_u32(C_u32x4);
+    float32x4_t D_f32x4 = vcvtq_f32_u32(D_u32x4);
+    // vd = vd*vs + vo
+    A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
+    B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
+    C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
+    D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
+    // Convert float32 vectors to uint32 vectors
+    A_u32x4 = vcvtq_u32_f32(A_f32x4);
+    B_u32x4 = vcvtq_u32_f32(B_f32x4);
+    C_u32x4 = vcvtq_u32_f32(C_f32x4);
+    D_u32x4 = vcvtq_u32_f32(D_f32x4);
+    // Convert uint32 vectors to uint16 vectors (with saturation)
+    vd_low_u16x8  = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4));
+    vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4));
+    // Convert uint16 vectors to uint8 vectors (with saturation)
+    return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
+}
+inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
+{
+    // Convert int8 vectors to int16 vectors
+    const int8x8_t vd_low        = vget_low_s8(vd);
+    const int8x8_t vd_high       = vget_high_s8(vd);
+    int16x8_t      vd_low_s16x8  = vmovl_s8(vd_low);
+    int16x8_t      vd_high_s16x8 = vmovl_s8(vd_high);
+    // Convert int16 vectors to int32 vectors
+    int32x4_t A_s32x4 = vmovl_s16(vget_low_s16(vd_low_s16x8));
+    int32x4_t B_s32x4 = vmovl_s16(vget_high_s16(vd_low_s16x8));
+    int32x4_t C_s32x4 = vmovl_s16(vget_low_s16(vd_high_s16x8));
+    int32x4_t D_s32x4 = vmovl_s16(vget_high_s16(vd_high_s16x8));
+    // Convert int32 vectors to float32 vectors
+    float32x4_t A_f32x4 = vcvtq_f32_s32(A_s32x4);
+    float32x4_t B_f32x4 = vcvtq_f32_s32(B_s32x4);
+    float32x4_t C_f32x4 = vcvtq_f32_s32(C_s32x4);
+    float32x4_t D_f32x4 = vcvtq_f32_s32(D_s32x4);
+    // vd = vd*vs + vo
+    A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
+    B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
+    C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
+    D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
+    // Convert float32 vectors to int32 vectors
+    A_s32x4 = vcvtq_s32_f32(A_f32x4);
+    B_s32x4 = vcvtq_s32_f32(B_f32x4);
+    C_s32x4 = vcvtq_s32_f32(C_f32x4);
+    D_s32x4 = vcvtq_s32_f32(D_f32x4);
+    // Convert int32 vectors to int16 vectors (with saturation)
+    vd_low_s16x8  = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4));
+    vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4));
+    // Convert int16 vectors to int8 vectors (with saturation)
+    return vcombine_s8(vqmovn_s16(vd_low_s16x8), vqmovn_s16(vd_high_s16x8));
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/NEFixedPoint.h b/src/core/NEON/NEFixedPoint.h
new file mode 100644
index 0000000..5c49b25
--- /dev/null
+++ b/src/core/NEON/NEFixedPoint.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEFIXEDPOINT_H
+#define ARM_COMPUTE_NEFIXEDPOINT_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+/** Compute the lane-by-lane maximum between two float vectors with 4x2 elements each
+ *
+ * @param[in] a Float input vector
+ * @param[in] b Float input vector
+ *
+ * @return The lane-by-lane maximum -> float32x4x2
+ */
+float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b);
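+
+// Usage sketch (illustrative; a and b are assumed float32x4x2_t values): each
+// lane of the result holds the maximum of the corresponding lanes of a and b:
+//
+//   const float32x4x2_t m = vmax2q_f32(a, b);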
+} // namespace arm_compute
+#include "src/core/NEON/NEFixedPoint.inl"
+#endif /* ARM_COMPUTE_NEFIXEDPOINT_H */
\ No newline at end of file
diff --git a/src/core/NEON/NEFixedPoint.inl b/src/core/NEON/NEFixedPoint.inl
new file mode 100644
index 0000000..8bff9c4
--- /dev/null
+++ b/src/core/NEON/NEFixedPoint.inl
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <array>
+#include <limits>
+
+namespace arm_compute
+{
+#ifndef DOXYGEN_SKIP_THIS
+
+inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
+{
+    float32x4x2_t res =
+    {
+        {
+            vmaxq_f32(a.val[0], b.val[0]),
+            vmaxq_f32(a.val[1], b.val[1])
+        }
+    };
+    return res;
+}
+#endif /* DOXYGEN_SKIP_THIS */
+} // namespace arm_compute
diff --git a/src/core/NEON/NEMath.h b/src/core/NEON/NEMath.h
new file mode 100644
index 0000000..877ffb2
--- /dev/null
+++ b/src/core/NEON/NEMath.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMATH_H
+#define ARM_COMPUTE_NEMATH_H
+
+#include <arm_neon.h>
+#include <array>
+
+namespace arm_compute
+{
+/** Calculate floor of a vector.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated floor vector.
+ */
+float32x4_t vfloorq_f32(float32x4_t val);
+
+/** Round each element of a vector to the nearest integral value, with ties to even.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated round vector.
+ */
+float32x4_t vroundq_rte_f32(float32x4_t val);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+float32x2_t vinvsqrt_f32(float32x2_t x);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+float32x4_t vinvsqrtq_f32(float32x4_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+float32x2_t vinv_f32(float32x2_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+float32x4_t vinvq_f32(float32x4_t x);
+
+/** Perform a 7th degree polynomial approximation using Estrin's method.
+ *
+ * @param[in] x      Input vector value in F32 format.
+ * @param[in] coeffs Polynomial coefficients table.
+ *
+ * @return The calculated approximation.
+ */
+float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs);
+
+/** Calculate exponential
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated exponent.
+ */
+float32x4_t vexpq_f32(float32x4_t x);
+
+/** Calculate logarithm
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated logarithm.
+ */
+float32x4_t vlogq_f32(float32x4_t x);
+
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-10,10] to avoid overflowing issues.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+float32x4_t vtanhq_f32(float32x4_t val);
+
+/** Calculate the n-th power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] val Input vector value in F32 format.
+ * @param[in] n   Powers to raise the input to.
+ *
+ * @return The calculated power.
+ */
+float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
+
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function calculates the following expression: (x + 2^(n-1)) / 2^n where n = exponent
+ *
+ * @param[in] x        Vector of 4 elements
+ * @param[in] exponent Vector of 4 elements with integer value used to round to nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent);
+
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function calculates the following expression: (x + 2^(n-1)) / 2^n where n = exponent
+ *
+ * @param[in] x        Vector of 4 elements
+ * @param[in] exponent Integer value used to round to nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent);
+
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function calculates the following expression: (x + 2^(n-1)) / 2^n where n = exponent
+ *
+ * @param[in] x        Element to divide.
+ * @param[in] exponent Integer value used to round to nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32_t rounding_divide_by_pow2(int32_t x, int exponent);
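+
+// For example (illustrative): rounding_divide_by_pow2(127, 5) == 4, since
+// 127 / 2^5 = 3.97 rounds to 4, whereas a plain arithmetic shift (127 >> 5)
+// would truncate to 3.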
+
+/** Converts from uint8x16 to float32x4x4_t
+ *
+ * @param[in] in Vector of uint8 to be converted
+ *
+ * @return Converted vector of float
+ */
+float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in);
+
+/** Converts from int8x16 to float32x4x4_t
+ *
+ * @param[in] in Vector of int8 to be converted
+ *
+ * @return Converted vector of float
+ */
+float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in);
+
+/** Converts the specified templated 16-element vector to float32x4x4_t
+ *
+ * @param[in] in Vector to be converted
+ *
+ * @return Converted vector of float
+ */
+template <typename T>
+float32x4x4_t convert_to_float32x4x4(const T &in);
+
+/** Converts from two float32x4x3_t to just one uint8x8x3_t
+ *
+ * @param[in]  in1 First input vector of float to be converted
+ * @param[in]  in2 Second input vector of float to be converted
+ * @param[out] out Converted output vector uint8 to store the result
+ */
+void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out);
+
+/** Converts from float32x4x4_t to just one uint8x16_t
+ *
+ * @param[in]  in  Vector of float to be converted
+ * @param[out] out Converted vector of uint8 to store the result
+ */
+void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out);
+
+/** Converts from float32x4x4_t to just one int8x16_t
+ *
+ * @param[in]  in  Vector of float to be converted
+ * @param[out] out Converted vector of int8 to store the result
+ */
+void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out);
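+
+// Usage sketch (illustrative; vin is an assumed uint8x16_t): widen 16 QASYMM8
+// values to float for arithmetic, then narrow them back with saturation:
+//
+//   const float32x4x4_t f = convert_uint8x16_to_float32x4x4(vin);
+//   uint8x16_t          vout;
+//   convert_float32x4x4_to_uint8x16(f, vout);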
+
+/** Calculate sine.
+ *
+ * @param[in] val Input vector value in radians, F32 format.
+ *
+ * @return The calculated sine.
+ */
+float32x4_t vsinq_f32(float32x4_t val);
+
+/** Calculate sine.
+ *
+ * @param[in] val Input vector value in radians, F32 format.
+ *
+ * @return The calculated sine.
+ */
+float32x2_t vsin_f32(float32x2_t val);
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-10,10] to avoid overflowing issues.
+ *
+ * @param[in] val Input vector value in F16 format.
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+float16x8_t vtanhq_f16(float16x8_t val);
+
+/** Round each element of a vector to the nearest integral value, with ties to even.
+ *
+ * @param[in] val Input vector value in F16 format.
+ *
+ * @return The calculated round vector.
+ */
+float16x8_t vroundq_rte_f16(float16x8_t val);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+float16x4_t vinv_f16(float16x4_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+float16x8_t vinvq_f16(float16x8_t x);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+float16x4_t vinvsqrt_f16(float16x4_t x);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+float16x8_t vinvsqrtq_f16(float16x8_t x);
+
+/** Calculate exponential
+ *
+ * @param[in] x Input vector value in F16 format.
+ *
+ * @return The calculated exponent.
+ */
+float16x8_t vexpq_f16(float16x8_t x);
+
+/** Calculate the n-th power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] val Input vector value in F16 format.
+ * @param[in] n   Powers to raise the input to.
+ *
+ * @return The calculated power.
+ */
+float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
+
+/** Calculate sine.
+ *
+ * @param[in] val Input vector value in radians, F16 format.
+ *
+ * @return The calculated sine.
+ */
+float16x8_t vsinq_f16(float16x8_t val);
+
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+} // namespace arm_compute
+#include "src/core/NEON/NEMath.inl"
+#endif /* ARM_COMPUTE_NEMATH_H */
diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl
new file mode 100644
index 0000000..a1c3d41
--- /dev/null
+++ b/src/core/NEON/NEMath.inl
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <cmath>
+#include <limits>
+
+#ifndef M_PI
+#define M_PI (3.14159265358979323846)
+#endif // M_PI
+
+namespace arm_compute
+{
+/** Exponent polynomial coefficients */
+const std::array<float32x4_t, 8> exp_tab =
+{
+    {
+        vdupq_n_f32(1.f),
+        vdupq_n_f32(0.0416598916054f),
+        vdupq_n_f32(0.500000596046f),
+        vdupq_n_f32(0.0014122662833f),
+        vdupq_n_f32(1.00000011921f),
+        vdupq_n_f32(0.00833693705499f),
+        vdupq_n_f32(0.166665703058f),
+        vdupq_n_f32(0.000195780929062f),
+    }
+};
+
+/** Logarithm polynomial coefficients */
+const std::array<float32x4_t, 8> log_tab =
+{
+    {
+        vdupq_n_f32(-2.29561495781f),
+        vdupq_n_f32(-2.47071170807f),
+        vdupq_n_f32(-5.68692588806f),
+        vdupq_n_f32(-0.165253549814f),
+        vdupq_n_f32(5.17591238022f),
+        vdupq_n_f32(0.844007015228f),
+        vdupq_n_f32(4.58445882797f),
+        vdupq_n_f32(0.0141278216615f),
+    }
+};
+
+/** Sin polynomial coefficients */
+constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3)
+constexpr float te_sin_coeff3 = 0.05f;           // 1/(4*5)
+constexpr float te_sin_coeff4 = 0.023809523810f; // 1/(6*7)
+constexpr float te_sin_coeff5 = 0.013888888889f; // 1/(8*9)
+
+#ifndef DOXYGEN_SKIP_THIS
+inline float32x4_t vfloorq_f32(float32x4_t val)
+{
+    static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+
+    const int32x4_t   z = vcvtq_s32_f32(val);
+    const float32x4_t r = vcvtq_f32_s32(z);
+
+    return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, CONST_1), r);
+}
+
+inline float32x4_t vroundq_rte_f32(float32x4_t val)
+{
+#ifdef __aarch64__
+    return vrndnq_f32(val);
+#else  // __aarch64__
+    static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f);
+    static const float32x4_t CONST_1_FLOAT    = vdupq_n_f32(1.f);
+    static const int32x4_t   CONST_1_INT      = vdupq_n_s32(1);
+    const float32x4_t        floor_val        = vfloorq_f32(val);
+    const float32x4_t        diff             = vsubq_f32(val, floor_val);
+
+    /*
+    * Select the floor value when (diff < 0.5) || (diff == 0.5 && floor_val % 2 == 0).
+    * This condition is checked by vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT))))
+    */
+
+    return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))),
+                     floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
+#endif // __aarch64__
+}
+
+inline float32x2_t vinvsqrt_f32(float32x2_t x)
+{
+    float32x2_t sqrt_reciprocal = vrsqrte_f32(x);
+    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+    return sqrt_reciprocal;
+}
+
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
+{
+    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+    return sqrt_reciprocal;
+}
+
+inline float32x2_t vinv_f32(float32x2_t x)
+{
+    float32x2_t recip = vrecpe_f32(x);
+    recip             = vmul_f32(vrecps_f32(x, recip), recip);
+    recip             = vmul_f32(vrecps_f32(x, recip), recip);
+    return recip;
+}
+
+inline float32x4_t vinvq_f32(float32x4_t x)
+{
+    float32x4_t recip = vrecpeq_f32(x);
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    return recip;
+}
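+// Note on the reciprocal helpers above: vrecpe*/vrsqrte* provide a coarse
+// initial estimate (roughly 8 correct bits) and each vrecps*/vrsqrts*
+// Newton-Raphson step approximately doubles the number of correct bits, so
+// two refinement steps give near single-precision accuracy.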
+
+inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs)
+{
+    float32x4_t A   = vmlaq_f32(coeffs[0], coeffs[4], x);
+    float32x4_t B   = vmlaq_f32(coeffs[2], coeffs[6], x);
+    float32x4_t C   = vmlaq_f32(coeffs[1], coeffs[5], x);
+    float32x4_t D   = vmlaq_f32(coeffs[3], coeffs[7], x);
+    float32x4_t x2  = vmulq_f32(x, x);
+    float32x4_t x4  = vmulq_f32(x2, x2);
+    float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
+    return res;
+}
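+// The degree-7 polynomial above is evaluated Estrin-style:
+//   res = (A + B*x^2) + (C + D*x^2)*x^4
+// which expands to c0 + c4*x + c2*x^2 + c6*x^3 + c1*x^4 + c5*x^5 + c3*x^6 + c7*x^7.
+// This is why exp_tab and log_tab store their coefficients in interleaved
+// order rather than by ascending power.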
+
+inline float32x4_t vexpq_f32(float32x4_t x)
+{
+    static const float32x4_t CONST_LN2          = vdupq_n_f32(0.6931471805f); // ln(2)
+    static const float32x4_t CONST_INV_LN2      = vdupq_n_f32(1.4426950408f); // 1/ln(2)
+    static const float32x4_t CONST_INF          = vdupq_n_f32(std::numeric_limits<float>::infinity());
+    static const float32x4_t CONST_MAX_INPUT    = vdupq_n_f32(88.7f);
+    static const float32x4_t CONST_0            = vdupq_n_f32(0.f);
+    static const int32x4_t   CONST_NEGATIVE_126 = vdupq_n_s32(-126);
+
+    // Perform range reduction to [-ln(2), ln(2)]
+    int32x4_t   m   = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
+    float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
+
+    // Polynomial Approximation
+    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
+
+    // Reconstruct
+    poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23)));
+    poly = vbslq_f32(vcltq_s32(m, CONST_NEGATIVE_126), CONST_0, poly); // Handle underflow
+    poly = vbslq_f32(vcgtq_f32(x, CONST_MAX_INPUT), CONST_INF, poly);  // Handle overflow
+
+    return poly;
+}
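+// Reconstruction above relies on exp(x) = 2^m * exp(val): adding (m << 23) to
+// the biased-exponent bits of poly scales it by 2^m without a float multiply.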
+
+inline float32x4_t vlogq_f32(float32x4_t x)
+{
+    static const int32x4_t   CONST_127 = vdupq_n_s32(127);           // 127
+    static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
+
+    // Extract exponent
+    int32x4_t   m   = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
+    float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+
+    // Polynomial Approximation
+    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
+
+    // Reconstruct
+    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
+
+    return poly;
+}
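+// Mirror of vexpq_f32: x is split as 2^m * val with val in [1, 2), so
+// log(x) = m * ln(2) + log(val), where log(val) is the polynomial result.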
+
+inline float32x4_t vtanhq_f32(float32x4_t val)
+{
+    static const float32x4_t CONST_1        = vdupq_n_f32(1.f);
+    static const float32x4_t CONST_2        = vdupq_n_f32(2.f);
+    static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-10.f);
+    static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(10.f);
+
+    float32x4_t x     = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
+    float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
+    float32x4_t num   = vsubq_f32(exp2x, CONST_1);
+    float32x4_t den   = vaddq_f32(exp2x, CONST_1);
+    float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den));
+    return tanh;
+}
+
+inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
+{
+    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
+}
+
+inline float32x4_t vsinq_f32(float32x4_t val)
+{
+    const float32x4_t pi_v   = vdupq_n_f32(M_PI);
+    const float32x4_t pio2_v = vdupq_n_f32(M_PI / 2);
+    const float32x4_t ipi_v  = vdupq_n_f32(1 / M_PI);
+
+    //Find positive or negative
+    const int32x4_t  c_v    = vabsq_s32(vcvtq_s32_f32(vmulq_f32(val, ipi_v)));
+    const uint32x4_t sign_v = vcleq_f32(val, vdupq_n_f32(0));
+    const uint32x4_t odd_v  = vandq_u32(vreinterpretq_u32_s32(c_v), vdupq_n_u32(1));
+
+    uint32x4_t neg_v = veorq_u32(odd_v, sign_v);
+
+    //Modulus a - (n * int(a*(1/n)))
+    float32x4_t      ma    = vsubq_f32(vabsq_f32(val), vmulq_f32(pi_v, vcvtq_f32_s32(c_v)));
+    const uint32x4_t reb_v = vcgeq_f32(ma, pio2_v);
+
+    //Rebase a between 0 and pi/2
+    ma = vbslq_f32(reb_v, vsubq_f32(pi_v, ma), ma);
+
+    //Taylor series
+    const float32x4_t ma2 = vmulq_f32(ma, ma);
+
+    //2nd elem: x^3 / 3!
+    float32x4_t elem = vmulq_f32(vmulq_f32(ma, ma2), vdupq_n_f32(te_sin_coeff2));
+    float32x4_t res  = vsubq_f32(ma, elem);
+
+    //3rd elem: x^5 / 5!
+    elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff3));
+    res  = vaddq_f32(res, elem);
+
+    //4th elem: x^7 / 7!
+    elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff4));
+    res  = vsubq_f32(res, elem);
+
+    //5th elem: x^9 / 9!
+    elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff5));
+    res  = vaddq_f32(res, elem);
+
+    //Change of sign
+    neg_v = vshlq_n_u32(neg_v, 31);
+    res   = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(res), neg_v));
+    return res;
+}
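+// Summary of the reduction above: c counts multiples of pi in |val|; its
+// parity combined with the input sign (neg_v) fixes the final sign, and ma is
+// folded into [0, pi/2] before the odd Taylor series is summed.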
+
+inline float32x2_t vsin_f32(float32x2_t val)
+{
+    const float32x2_t pi_v   = vdup_n_f32(M_PI);
+    const float32x2_t pio2_v = vdup_n_f32(M_PI / 2);
+    const float32x2_t ipi_v  = vdup_n_f32(1 / M_PI);
+
+    //Find positive or negative
+    const int32x2_t  c_v    = vabs_s32(vcvt_s32_f32(vmul_f32(val, ipi_v)));
+    const uint32x2_t sign_v = vcle_f32(val, vdup_n_f32(0));
+    const uint32x2_t odd_v  = vand_u32(vreinterpret_u32_s32(c_v), vdup_n_u32(1));
+
+    uint32x2_t neg_v = veor_u32(odd_v, sign_v);
+
+    //Modulus a - (n * int(a*(1/n)))
+    float32x2_t      ma    = vsub_f32(vabs_f32(val), vmul_f32(pi_v, vcvt_f32_s32(c_v)));
+    const uint32x2_t reb_v = vcge_f32(ma, pio2_v);
+
+    //Rebase a between 0 and pi/2
+    ma = vbsl_f32(reb_v, vsub_f32(pi_v, ma), ma);
+
+    //Taylor series
+    const float32x2_t ma2 = vmul_f32(ma, ma);
+
+    //2nd elem: x^3 / 3!
+    float32x2_t elem = vmul_f32(vmul_f32(ma, ma2), vdup_n_f32(te_sin_coeff2));
+    float32x2_t res  = vsub_f32(ma, elem);
+
+    //3rd elem: x^5 / 5!
+    elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff3));
+    res  = vadd_f32(res, elem);
+
+    //4th elem: x^7 / 7!
+    elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff4));
+    res  = vsub_f32(res, elem);
+
+    //5th elem: x^9 / 9!
+    elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff5));
+    res  = vadd_f32(res, elem);
+
+    //Change of sign
+    neg_v = vshl_n_u32(neg_v, 31);
+    res   = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(res), neg_v));
+    return res;
+}
+
+#endif /* DOXYGEN_SKIP_THIS */
+
+inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent)
+{
+    const int32x4_t shift_vec  = vnegq_s32(exponent);
+    const int32x4_t fixup      = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
+    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
+    return vrshlq_s32(fixed_up_x, shift_vec);
+}
+
+inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent)
+{
+    const int32x4_t shift_vec  = vdupq_n_s32(-exponent);
+    const int32x4_t fixup      = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
+    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
+    return vrshlq_s32(fixed_up_x, shift_vec);
+}
+
+inline int32_t rounding_divide_by_pow2(int32_t x, int exponent)
+{
+    const int32_t mask      = (1 << exponent) - 1;
+    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
+    return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
+}
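+// Illustrative behaviour of the scalar overload: halves round away from zero,
+// e.g. with exponent = 1, 3 -> 2 and -3 -> -2 (1.5 and -1.5), while
+// 5 -> 3 and -5 -> -3 (2.5 and -2.5).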
+
+inline float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in)
+{
+    float32x4x4_t out;
+
+    const auto tmp1 = vmovl_u8(vget_low_u8(in));
+    out.val[0]      = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
+    out.val[1]      = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
+
+    const auto tmp2 = vmovl_u8(vget_high_u8(in));
+    out.val[2]      = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp2)));
+    out.val[3]      = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp2)));
+    return out;
+}
+
+inline float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in)
+{
+    float32x4x4_t out;
+
+    const auto tmp1 = vmovl_s8(vget_low_s8(in));
+    out.val[0]      = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp1)));
+    out.val[1]      = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp1)));
+
+    const auto tmp2 = vmovl_s8(vget_high_s8(in));
+    out.val[2]      = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp2)));
+    out.val[3]      = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp2)));
+    return out;
+}
+
+template <>
+inline float32x4x4_t convert_to_float32x4x4(const uint8x16_t &in)
+{
+    return convert_uint8x16_to_float32x4x4(in);
+}
+
+template <>
+inline float32x4x4_t convert_to_float32x4x4(const int8x16_t &in)
+{
+    return convert_int8x16_to_float32x4x4(in);
+}
+
+inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
+{
+    out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
+    out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
+    out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
+}
+
+inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
+{
+    const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
+                                  vqmovn_u32(vcvtq_u32_f32(in.val[1])));
+    const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
+                                   vqmovn_u32(vcvtq_u32_f32(in.val[3])));
+    out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
+}
+
+inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out)
+{
+    const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])),
+                                  vqmovn_s32(vcvtq_s32_f32(in.val[1])));
+    const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])),
+                                   vqmovn_s32(vcvtq_s32_f32(in.val[3])));
+    out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
+}
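+// Note: the float-to-integer conversions above truncate toward zero, and the
+// vqmovn narrowing stages saturate, so out-of-range values clamp to
+// [0, 255] (unsigned) or [-128, 127] (signed) instead of wrapping.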
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef DOXYGEN_SKIP_THIS
+inline float16x8_t vfloorq_f16(float16x8_t val)
+{
+    static const float16x8_t CONST_1 = vdupq_n_f16(1.f);
+
+    const int16x8_t   z = vcvtq_s16_f16(val);
+    const float16x8_t r = vcvtq_f16_s16(z);
+
+    return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, CONST_1), r);
+}
+
+inline float16x8_t vroundq_rte_f16(float16x8_t val)
+{
+    return vrndnq_f16(val);
+}
+
+inline float16x4_t vinvsqrt_f16(float16x4_t x)
+{
+    float16x4_t sqrt_reciprocal = vrsqrte_f16(x);
+    sqrt_reciprocal             = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    return sqrt_reciprocal;
+}
+
+inline float16x8_t vinvsqrtq_f16(float16x8_t x)
+{
+    float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
+    sqrt_reciprocal             = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    return sqrt_reciprocal;
+}
+
+inline float16x4_t vinv_f16(float16x4_t x)
+{
+    float16x4_t recip = vrecpe_f16(x);
+    recip             = vmul_f16(vrecps_f16(x, recip), recip);
+    recip             = vmul_f16(vrecps_f16(x, recip), recip);
+    return recip;
+}
+
+inline float16x8_t vinvq_f16(float16x8_t x)
+{
+    float16x8_t recip = vrecpeq_f16(x);
+    recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
+    recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
+    return recip;
+}
+
+inline float16x8_t vtanhq_f16(float16x8_t val)
+{
+    const float16x8_t CONST_1        = vdupq_n_f16(1.f);
+    const float16x8_t CONST_2        = vdupq_n_f16(2.f);
+    const float16x8_t CONST_MIN_TANH = vdupq_n_f16(-10.f);
+    const float16x8_t CONST_MAX_TANH = vdupq_n_f16(10.f);
+
+    const float16x8_t x     = vminq_f16(vmaxq_f16(val, CONST_MIN_TANH), CONST_MAX_TANH);
+    const float16x8_t exp2x = vexpq_f16(vmulq_f16(CONST_2, x));
+    const float16x8_t num   = vsubq_f16(exp2x, CONST_1);
+    const float16x8_t den   = vaddq_f16(exp2x, CONST_1);
+    const float16x8_t tanh  = vmulq_f16(num, vinvq_f16(den));
+    return tanh;
+}
+
+inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t, 8> &coeffs)
+{
+    const float16x8_t A   = vaddq_f16(coeffs[0], vmulq_f16(coeffs[4], x));
+    const float16x8_t B   = vaddq_f16(coeffs[2], vmulq_f16(coeffs[6], x));
+    const float16x8_t C   = vaddq_f16(coeffs[1], vmulq_f16(coeffs[5], x));
+    const float16x8_t D   = vaddq_f16(coeffs[3], vmulq_f16(coeffs[7], x));
+    const float16x8_t x2  = vmulq_f16(x, x);
+    const float16x8_t x4  = vmulq_f16(x2, x2);
+    const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4));
+    return res;
+}
+
+inline float16x8_t vexpq_f16(float16x8_t x)
+{
+    // TODO (COMPMID-1535) : Revisit FP16 approximations
+    const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
+    const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));
+
+    const float16x8_t res = vcombine_f16(vcvt_f16_f32(vexpq_f32(x_low)), vcvt_f16_f32(vexpq_f32(x_high)));
+    return res;
+}
+
+inline float16x8_t vlogq_f16(float16x8_t x)
+{
+    // TODO (COMPMID-1535) : Revisit FP16 approximations
+    const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
+    const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));
+
+    const float16x8_t res = vcombine_f16(vcvt_f16_f32(vlogq_f32(x_low)), vcvt_f16_f32(vlogq_f32(x_high)));
+    return res;
+}
+
+inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
+{
+    // TODO (giaiod01) - COMPMID-1535
+    float32x4_t n0_f32   = vcvt_f32_f16(vget_low_f16(n));
+    float32x4_t n1_f32   = vcvt_f32_f16(vget_high_f16(n));
+    float32x4_t val0_f32 = vcvt_f32_f16(vget_low_f16(val));
+    float32x4_t val1_f32 = vcvt_f32_f16(vget_high_f16(val));
+
+    float32x4_t res0_f32 = vexpq_f32(vmulq_f32(n0_f32, vlogq_f32(val0_f32)));
+    float32x4_t res1_f32 = vexpq_f32(vmulq_f32(n1_f32, vlogq_f32(val1_f32)));
+
+    return vcombine_f16(vcvt_f16_f32(res0_f32), vcvt_f16_f32(res1_f32));
+}
+
+inline float16x8_t vsinq_f16(float16x8_t val)
+{
+    const float32x4_t val_high = vcvt_f32_f16(vget_high_f16(val));
+    const float32x4_t val_low  = vcvt_f32_f16(vget_low_f16(val));
+
+    const float32x4_t res_high = vsinq_f32(val_high);
+    const float32x4_t res_low  = vsinq_f32(val_low);
+
+    return vcombine_f16(vcvt_f16_f32(res_low), vcvt_f16_f32(res_high));
+}
+
+inline float16x4_t vsin_f16(float16x4_t val)
+{
+    const float32x4_t val_f32  = vcvt_f32_f16(val);
+    const float32x2_t val_high = vget_high_f32(val_f32);
+    const float32x2_t val_low  = vget_low_f32(val_f32);
+
+    const float32x2_t res_high = vsin_f32(val_high);
+    const float32x2_t res_low  = vsin_f32(val_low);
+
+    return vcvt_f16_f32(vcombine_f32(res_low, res_high));
+}
+
+#endif /* DOXYGEN_SKIP_THIS */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+} // namespace arm_compute
diff --git a/src/core/NEON/NESymm.h b/src/core/NEON/NESymm.h
new file mode 100644
index 0000000..e664457
--- /dev/null
+++ b/src/core/NEON/NESymm.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESYMM_H
+#define ARM_COMPUTE_NESYMM_H
+
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+using qsymm8_t  = int8_t;  /**< 8 bit quantized symmetric scalar value */
+using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */
+
+using qsymm16x8_t   = int16x8_t;   /**< 16 bit quantized symmetric vector with 8 elements */
+using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 16 elements */
+
+/** Performs the final quantization step on 8 signed 32-bit accumulators, producing 16-bit elements
+ *
+ * @tparam is_bounded_relu Specifies whether a fused bounded ReLU should be applied
+ *
+ * @param[in] in_s32                       Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter
+ * @param[in] result_shift                 Result shift parameter
+ * @param[in] min_s16                      Relu lower bound
+ * @param[in] max_s16                      Relu upper bound
+ *
+ * @return Quantized values
+ */
+template <bool is_bounded_relu>
+int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
+                                      int          result_fixedpoint_multiplier,
+                                      int32_t      result_shift,
+                                      int16x8_t    min_s16,
+                                      int16x8_t    max_s16)
+{
+    if(result_shift < 0)
+    {
+        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift));
+        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift));
+
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+        // Round to the nearest division by a power-of-two using result_shift
+        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
+        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
+    }
+
+    // Convert S32 to S16
+    int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_s16 = vmaxq_s16(out_s16, min_s16);
+        out_s16 = vminq_s16(out_s16, max_s16);
+    }
+
+    return out_s16;
+}
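+// Usage sketch (illustrative names): with is_bounded_relu = false the bound
+// arguments are ignored, e.g.
+//   int16x8_t q = finalize_quantization_int16<false>(acc_s32, mult, shift,
+//                                                    vdupq_n_s16(0), vdupq_n_s16(0));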
+
+/** Performs the final quantization step on a single signed 32-bit value, producing a 16-bit result
+ *
+ * @tparam is_bounded_relu Specifies whether a fused bounded ReLU should be applied
+ *
+ * @param[in] in_value                     Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter
+ * @param[in] result_shift                 Result shift parameter
+ * @param[in] min_s16                      Relu lower bound
+ * @param[in] max_s16                      Relu upper bound
+ *
+ * @return Quantized values
+ */
+template <bool is_bounded_relu>
+inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier,
+                                           int32_t result_shift, int16_t min_s16, int16_t max_s16)
+{
+    if(result_shift < 0)
+    {
+        const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) * static_cast<int64_t>(result_fixedpoint_multiplier);
+        in_value            = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        const int64_t in_64 = static_cast<int64_t>(in_value) * static_cast<int64_t>(result_fixedpoint_multiplier);
+        // Shift value by result_shift
+        in_value = rounding_divide_by_pow2(static_cast<int32_t>((in_64 + (1 << 30)) >> 31), result_shift);
+    }
+
+    // Bound the result
+    int16_t out_s16 = static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, in_value)));
+
+    if(is_bounded_relu)
+    {
+        out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16)));
+    }
+
+    return out_s16;
+}
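+// Both overloads implement the gemmlowp-style requantization
+//   out ~= in * result_fixedpoint_multiplier * 2^-31 / 2^result_shift
+// (a negative result_shift multiplies instead of divides), saturated to the
+// int16 range.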
+
+/** Dequantize a neon vector holding 8 16-bit quantized values.
+ *
+ * @param[in] qv    Input values to be dequantized.
+ * @param[in] scale Quantization scale
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale)
+{
+    const float32x4_t   vscale = vdupq_n_f32(scale);
+    const float32x4x2_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Quantize a neon vector holding 8 floating point values.
+ *
+ * @param[in] qv    Input values to be quantized.
+ * @param[in] scale Quantization scale
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
+{
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+
+    const int32x4x2_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+#else  //__aarch64__
+            vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+#endif //__aarch64__
+        }
+    };
+    return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+}
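+// Round-trip sketch (illustrative): for scale = 0.5f,
+//   vquantize_int16(vdequantize_int16(qv, 0.5f), 0.5f) == qv
+// for every lane, since q * 0.5f * (1.f / 0.5f) is exact in float32.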
+
+/** Dequantize a neon vector holding 16 16-bit quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float         scale  = qi.scale;
+    const float32x4_t   vscale = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Quantize a neon vector holding 16 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float scale = qi.scale;
+    ARM_COMPUTE_ERROR_ON(scale == 0.f);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+            vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+            vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+#else  //__aarch64__
+            vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+            vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+            vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    const qsymm16x8x2_t res =
+    {
+        vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])),
+        vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])),
+    };
+
+    return res;
+}
+
+/** Multiply a neon vector by a quantized multiplier and shift
+ *
+ * @param[in] input Input vector of values to be multiplied
+ * @param[in] qmul  Quantized multiplier
+ * @param[in] shift Left bit shift; a negative value denotes a rounding right shift
+ *
+ * @return A neon vector holding the multiplied value
+ */
+inline int32x4x2_t multiply_by_quantized_multiplier_2row(int32x4x2_t input, int32_t qmul, int32_t shift)
+{
+    const auto left_shift  = shift > 0 ? shift : 0;
+    const auto right_shift = shift > 0 ? 0 : -shift;
+    const auto one_shifted = 1 << left_shift;
+
+    int32x4x2_t result;
+    result.val[0] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[0], one_shifted), qmul), right_shift);
+    result.val[1] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[1], one_shifted), qmul), right_shift);
+
+    return result;
+}
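+// Convention sketch (following gemmlowp): a positive shift is applied as a
+// plain multiply by 2^shift before the saturating doubling high multiply, a
+// negative shift as a rounding right shift afterwards, so
+//   result ~= input * qmul * 2^-31 * 2^shift.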
+
+} // namespace arm_compute
+#endif // ARM_COMPUTE_NESYMM_H
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index b15df31..621af51 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -26,12 +26,12 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NESymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 #include <set>
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 5f5a3e5..525e286 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -27,8 +27,8 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <map>
 #include <string>
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index b2700d9..a3da750 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -24,11 +24,11 @@
 #include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
 
 #include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NESymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
index 0ee6d0e..c7169d8 100644
--- a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
@@ -26,12 +26,12 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 0651cf2..50e4647 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -25,15 +25,15 @@
 
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
 
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <map>
 
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index fa8332e..caaa6c2 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,9 +25,9 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 #include <cstdint>
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
index 1f07965..bc8c775 100644
--- a/src/core/NEON/kernels/NEColorConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,12 +29,13 @@
 #include "arm_compute/core/IMultiImage.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/NEON/NEColorConvertHelper.inl"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
+#include "src/core/NEON/kernels/detail/NEColorConvertHelper.inl"
+
 using namespace arm_compute;
 
 NEColorConvertKernel::NEColorConvertKernel()
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
index d439f43..f40f121 100644
--- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
+++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,10 +26,10 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index 03bc9f0..7c65e71 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -29,11 +29,11 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Window.h"
 
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/helpers/bit_ops.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index 6926ec1..6066326 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -27,13 +27,13 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index 5df3e3e..ee23909 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -27,13 +27,13 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/SaturateCast.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
 
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index d012cbf..6465848 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -25,10 +25,10 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
index 62b2531..fb47879 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
@@ -25,11 +25,11 @@
 
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/NEON/wrapper/traits.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
+#include "src/core/NEON/wrapper/traits.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index fc0933b..9352088 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,12 +27,12 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NESymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 559b673..ac1d6ae 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -22,7 +22,9 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CPP/Validate.h"
@@ -30,15 +32,13 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/NEFixedPoint.h"
 
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include <algorithm>
-#include <arm_neon.h>
 
 using namespace arm_compute;
 using namespace arm_compute::detail;
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 8e2b88f..c022fa0 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -28,13 +28,13 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
index da53a52..f862d04 100644
--- a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
@@ -26,9 +26,9 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 #include <map>
diff --git a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
index 747bd41..40430bd 100644
--- a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
@@ -27,8 +27,8 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
index c041b4c..de8ba3f 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,8 +34,8 @@
 #include <complex>
 #include <map>
 
-#include "arm_compute/core/NEON/wrapper/traits.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/wrapper/traits.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
index ea2831f..d99ff95 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,11 +24,11 @@
 #include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index f078134..e134097 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -29,8 +29,8 @@
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/NEMath.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
index 282b1a6..00d251f 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
@@ -26,11 +26,11 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <map>
 
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
index e9332b2..8d0d7c2 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -27,13 +27,13 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
index 458b94b..023b798 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
@@ -27,12 +27,12 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
index 44d5565..68f16c5 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
@@ -27,13 +27,13 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NESymm.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/NESymm.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
index a0a5c5d..2ef32c4 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
@@ -27,13 +27,13 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/NEAsymm.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index a926903..8fc33dc 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -27,13 +27,13 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/NEAsymm.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 3ac2efc..1494cd4 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 2cac93a..bd93146 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,9 +26,9 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/NEFixedPoint.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index 5bec9d3..5c5367c 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,7 +29,6 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
@@ -37,6 +36,7 @@
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/NEFixedPoint.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
index 8a671bf..fc7b819 100644
--- a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
@@ -27,12 +27,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index f650d97..78acbc3 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -28,12 +28,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index dbcfda2..d99def5 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -26,13 +26,13 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEMath.h"
 
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
 #include <cmath>
 
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index dd2824b..9eafe18 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,12 +28,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEFixedPoint.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index 3fa4480..bcce843 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,11 +26,11 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 6cd0780..7b88826 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -27,13 +27,13 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index d840bb7..1b52117 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,11 +26,11 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index 4466c24..c5320b9 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -24,10 +24,10 @@
 #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 
 #include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NESymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 9b5736a..1310ef3 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -28,17 +28,17 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
 #include "support/ToolchainSupport.h"
 
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include <algorithm>
 #include <arm_neon.h>
 #include <cmath>
@@ -2415,7 +2415,8 @@
 
                 // Store result
                 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
-                                requant_qinfo) : vres);
+                                requant_qinfo) :
+                                vres);
             }
         }
 
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
index cbfbda7..6a038f8 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
@@ -25,15 +25,16 @@
 
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/NESymm.h"
-#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/NESymm.h"
+
+#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 
 #include <map>
 
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index 26ba401..6d5202d 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -25,12 +25,12 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include "arm_compute/core/CPP/Validate.h"
 
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index c8a456a..7d8fbb1 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -27,10 +27,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include "arm_compute/core/Utils.h"
 
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 1691f68..01534f7 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -29,14 +29,14 @@
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/SaturateCast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/NEMath.h"
 
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
 
 namespace arm_compute
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index 2e6135b..0c44a7e 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -23,10 +23,10 @@
  */
 #include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
 
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 1a85352..94f5a18 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -26,10 +26,10 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/core/utils/misc/Utility.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include "src/core/utils/ScaleUtils.h"
 
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index 86e8233..286b8a6 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -28,10 +28,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "utils/TypePrinter.h"
 
 #include <arm_neon.h>
@@ -229,7 +229,7 @@
 
 Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y); 
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x);
     ARM_COMPUTE_RETURN_ERROR_ON(x->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, y);
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index bc5b0c0..e71818f 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -28,14 +28,14 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/SaturateCast.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
index e2fe88c..ccad92a 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -25,10 +25,10 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index b342cd2..2667611 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,10 +25,10 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp
index 6b291fd..9e8ec5c 100644
--- a/src/core/NEON/kernels/NEThresholdKernel.cpp
+++ b/src/core/NEON/kernels/NEThresholdKernel.cpp
@@ -28,7 +28,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
 
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
index 02cf133..69324c1 100644
--- a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
@@ -27,11 +27,11 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
index 171f596..d12b10c 100644
--- a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
@@ -27,12 +27,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
index b61633d..591aa1e 100644
--- a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
@@ -26,14 +26,15 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+
+#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
new file mode 100644
index 0000000..eef1be0
--- /dev/null
+++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
+#define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
+
+#include "src/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace detail
+{
+/** Dummy activation object */
+template <typename T, int S>
+struct dummy
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+
+    /** Construct a dummy activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit dummy(ActivationLayerInfo act_info)
+    {
+        ARM_COMPUTE_UNUSED(act_info);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        ARM_COMPUTE_UNUSED(vval);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        ARM_COMPUTE_UNUSED(val);
+    }
+};
+/** Linear activation object */
+template <typename T, int S>
+struct linear
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    /** Construct a Linear activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit linear(ActivationLayerInfo act_info)
+        : alpha(act_info.a()),
+          beta(act_info.b()),
+          valpha(wrapper::vdup_n(static_cast<T>(alpha), ExactTagType{})),
+          vbeta(wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))
+    {
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        vval = wrapper::vmla(vbeta, vval, valpha);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = alpha * val + beta;
+    }
+
+    const T         alpha;  /**< Scalar alpha */
+    const T         beta;   /**< Scalar beta */
+    const ExactType valpha; /**< Vector of alphas. */
+    const ExactType vbeta;  /**< Vector of betas. */
+};
+/** Square activation object */
+template <typename T, int S>
+struct square
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    /** Construct a Square activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit square(ActivationLayerInfo act_info)
+    {
+        ARM_COMPUTE_UNUSED(act_info);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        vval = wrapper::vmul(vval, vval);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = val * val;
+    }
+};
+/** Logistic activation object */
+template <typename T, int S>
+struct logistic
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    /** Construct a Logistic activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit logistic(ActivationLayerInfo act_info)
+        : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
+    {
+        ARM_COMPUTE_UNUSED(act_info);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        vval = wrapper::vinv(wrapper::vadd(vone, wrapper::vexpq(wrapper::vneg(vval))));
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = 1 / (1 + std::exp(-val));
+    }
+
+    /** Vector of ones. */
+    const ExactType vone;
+};
+/** RELU activation object */
+template <typename T, int S>
+struct relu
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    /** Construct a RELU activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit relu(ActivationLayerInfo act_info)
+        : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
+    {
+        ARM_COMPUTE_UNUSED(act_info);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        vval = wrapper::vmax(vzero, vval);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::max(static_cast<T>(0), val);
+    }
+
+    /** Vector of zeroes. */
+    const ExactType vzero;
+};
+/** Bounded RELU activation object */
+template <typename T, int S>
+struct brelu
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    /** Construct a bounded RELU activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit brelu(ActivationLayerInfo act_info)
+        : alpha(act_info.a()),
+          vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{})),
+          valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{}))
+    {
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        vval = wrapper::vmin(valpha, wrapper::vmax(vzero, vval));
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::min(alpha, std::max(static_cast<T>(0), val));
+    }
+
+    const T         alpha;  /**< Scalar alpha */
+    const ExactType vzero;  /**< Vector of zeroes. */
+    const ExactType valpha; /**< Vector of alphas. */
+};
+/** Lower-Upper Bounded RELU activation object */
+template <typename T, int S>
+struct lubrelu
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    /** Construct a lower-upper bounded RELU activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit lubrelu(ActivationLayerInfo act_info)
+        : alpha(act_info.a()),
+          beta(act_info.b()),
+          valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
+          vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
+    {
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        vval = wrapper::vmin(valpha, wrapper::vmax(vbeta, vval));
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::min(alpha, std::max(beta, val));
+    }
+
+    const T         alpha;  /**< Scalar alpha */
+    const T         beta;   /**< Scalar beta */
+    const ExactType valpha; /**< Vector of alphas. */
+    const ExactType vbeta;  /**< Vector of betas. */
+};
+} // namespace detail
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H */
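For context, a minimal sketch, not part of this patch, of how these functor templates slot into a kernel's inner loop. The apply_brelu_f32 function, the buffer walk and the include set are illustrative assumptions; detail::brelu, ActivationLayerInfo and the NEON intrinsics are the ones used above.

#include "arm_compute/core/Types.h"
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"

#include <arm_neon.h>
#include <cstddef>

// Apply a bounded ReLU in place over a float buffer: the vector overload
// handles the bulk, the scalar overload handles the tail elements.
void apply_brelu_f32(float *data, std::size_t n, const arm_compute::ActivationLayerInfo &info)
{
    arm_compute::detail::brelu<float, 4> act(info); // alpha comes from info.a()
    std::size_t i = 0;
    for(; i + 4 <= n; i += 4)
    {
        float32x4_t v = vld1q_f32(data + i);
        act(v); // v = min(valpha, max(vzero, v))
        vst1q_f32(data + i, v);
    }
    for(; i < n; ++i)
    {
        act(data[i]); // val = min(alpha, max(0, val))
    }
}

A kernel would typically select the functor type through a template parameter, so the activation folds into the main loop at compile time instead of branching per element.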
diff --git a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
new file mode 100644
index 0000000..ac196d9
--- /dev/null
+++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
@@ -0,0 +1,1045 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/Utils.h"
+#include "src/core/NEON/NEMath.h"
+
+#include <arm_neon.h>
+
+namespace
+{
+#ifndef DOXYGEN_SKIP_THIS
+constexpr float red_coef_bt709    = 1.5748f;
+constexpr float green_coef_bt709  = -0.1873f;
+constexpr float green_coef2_bt709 = -0.4681f;
+constexpr float blue_coef_bt709   = 1.8556f;
+
+constexpr float rgb2yuv_bt709_kr = 0.2126f;
+constexpr float rgb2yuv_bt709_kb = 0.0722f;
+// K_g = 1 - K_r - K_b
+constexpr float rgb2yuv_bt709_kg = 0.7152f;
+// C_u = 1 / (2 * (1 - K_b))
+constexpr float rgb2yuv_bt709_cu = 0.5389f;
+// C_v = 1 / (2 * (1 - K_r))
+constexpr float rgb2yuv_bt709_cv = 0.6350f;
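+// Quick numeric check of the constants above (BT.709):
+//   K_g = 1 - 0.2126 - 0.0722     = 0.7152
+//   C_u = 1 / (2 * (1 - 0.0722))  = 1 / 1.8556 = 0.5389
+//   C_v = 1 / (2 * (1 - 0.2126))  = 1 / 1.5748 = 0.6350
+// blue_coef_bt709 and red_coef_bt709 are the matching decode-side factors
+// 2 * (1 - K_b) and 2 * (1 - K_r).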
+
+constexpr float rgb2u8_red_coef   = 0.2126f;
+constexpr float rgb2u8_green_coef = 0.7152f;
+constexpr float rgb2u8_blue_coef  = 0.0722f;
+
+inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor,
+                                                const float rcoef, const float gcoef, const float bcoef)
+{
+    float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef);
+    greyscale             = vmlaq_n_f32(greyscale, gcolor, gcoef);
+    greyscale             = vmlaq_n_f32(greyscale, bcolor, bcoef);
+    return greyscale;
+}
+
+inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out)
+{
+    float32x4x4_t out_float32;
+
+    // Convert the three RGB channels from uint8x16_t to float32x4x4_t
+    const float32x4x4_t r_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[0]);
+    const float32x4x4_t g_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[1]);
+    const float32x4x4_t b_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[2]);
+
+    // New greyscale value = (RED_COEFF * R) + (GREEN_COEFF * G) + (BLUE_COEFF * B)
+    // Compute the greyscale float32x4x4_t from the three RGB float32x4x4_t
+    out_float32.val[0] = rgb_to_greyscale_calculation(r_float32.val[0], g_float32.val[0], b_float32.val[0],
+                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
+
+    out_float32.val[1] = rgb_to_greyscale_calculation(r_float32.val[1], g_float32.val[1], b_float32.val[1],
+                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
+
+    out_float32.val[2] = rgb_to_greyscale_calculation(r_float32.val[2], g_float32.val[2], b_float32.val[2],
+                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
+
+    out_float32.val[3] = rgb_to_greyscale_calculation(r_float32.val[3], g_float32.val[3], b_float32.val[3],
+                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
+
+    // Convert the greyscale result from float32x4x4_t back to uint8x16_t
+    arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out);
+}
+
+inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
+                                   float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
+{
+    /*
+    Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
+    U'=-0.1146*R' - 0.3854*G' + 0.5000*B'
+    V'= 0.5000*R' - 0.4542*G' - 0.0458*B'
+    */
+    const auto c128 = vdupq_n_f32(128.f);
+
+    // Y = R * K_r + G * (1 - K_r - K_b) + B * K_b
+    yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr);
+    yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg);
+    yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb);
+
+    // U = (B - Y) / (2 * (1 - K_b))
+    uvec = vsubq_f32(bvec, yvec);
+    uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu);
+
+    // V = (R - Y) / (2 * (1 - K_r))
+    vvec = vsubq_f32(rvec, yvec);
+    vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
+}
+
+inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
+                                    float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
+{
+    float32x4x3_t rgb1, rgb2;
+
+    // Compute: cb - 128 and cr - 128;
+    const auto c128 = vdupq_n_f32(128.f);
+    uvec_val        = vsubq_f32(uvec_val, c128);
+    vvec_val        = vsubq_f32(vvec_val, c128);
+
+    // Compute:
+    // r = 0.0000f*f_u + 1.5748f*f_v;
+    // g = -0.1873f*f_u - 0.4681f*f_v;
+    // b = 1.8556f*f_u + 0.0000f*f_v;
+    const auto red   = vmulq_n_f32(vvec_val, red_coef_bt709);
+    const auto blue  = vmulq_n_f32(uvec_val, blue_coef_bt709);
+    const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
+                                 vmulq_n_f32(vvec_val, green_coef2_bt709));
+
+    // Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
+    // The results are stored in two float32x4x3_t which are then converted to one uint8x8x3_t
+    // and written back to memory using vst3/vst4 lane instructions.
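+    // The lane order used in the stores below (0, 4, 1, 5, ...) assumes that
+    // convert_float32x4x3_to_uint8x8x3 packs the four rgb1 texels into lanes 0-3
+    // and the four rgb2 texels into lanes 4-7, so interleaving the lanes restores
+    // the original even/odd texel order of the source.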
+
+    rgb1.val[0] = vaddq_f32(yvec_val, red);
+    rgb1.val[1] = vaddq_f32(yvec_val, green);
+    rgb1.val[2] = vaddq_f32(yvec_val, blue);
+
+    rgb2.val[0] = vaddq_f32(yyvec_val, red);
+    rgb2.val[1] = vaddq_f32(yyvec_val, green);
+    rgb2.val[2] = vaddq_f32(yyvec_val, blue);
+
+    uint8x8x3_t u8_rgb;
+    arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
+
+    if(!alpha)
+    {
+        vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
+        vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
+        vst3_lane_u8(&output_ptr[6], u8_rgb, 1);
+        vst3_lane_u8(&output_ptr[9], u8_rgb, 5);
+        vst3_lane_u8(&output_ptr[12], u8_rgb, 2);
+        vst3_lane_u8(&output_ptr[15], u8_rgb, 6);
+        vst3_lane_u8(&output_ptr[18], u8_rgb, 3);
+        vst3_lane_u8(&output_ptr[21], u8_rgb, 7);
+    }
+    else
+    {
+        uint8x8x4_t u8_rgba;
+        u8_rgba.val[0] = u8_rgb.val[0];
+        u8_rgba.val[1] = u8_rgb.val[1];
+        u8_rgba.val[2] = u8_rgb.val[2];
+        u8_rgba.val[3] = vdup_n_u8(255);
+        vst4_lane_u8(&output_ptr[0], u8_rgba, 0);
+        vst4_lane_u8(&output_ptr[4], u8_rgba, 4);
+        vst4_lane_u8(&output_ptr[8], u8_rgba, 1);
+        vst4_lane_u8(&output_ptr[12], u8_rgba, 5);
+        vst4_lane_u8(&output_ptr[16], u8_rgba, 2);
+        vst4_lane_u8(&output_ptr[20], u8_rgba, 6);
+        vst4_lane_u8(&output_ptr[24], u8_rgba, 3);
+        vst4_lane_u8(&output_ptr[28], u8_rgba, 7);
+    }
+}
+
+inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
+{
+    uint8x16x3_t rgb;
+
+    if(alpha)
+    {
+        const auto tmp = vld4q_u8(ptr);
+        rgb.val[0]     = tmp.val[0];
+        rgb.val[1]     = tmp.val[1];
+        rgb.val[2]     = tmp.val[2];
+    }
+    else
+    {
+        rgb = vld3q_u8(ptr);
+    }
+
+    return rgb;
+}
+
+inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom)
+{
+    // Convert the uint8x16_t to float32x4x4_t
+    const float32x4x4_t frvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[0]);
+    const float32x4x4_t fgvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[1]);
+    const float32x4x4_t fbvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[2]);
+
+    const float32x4x4_t frvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[0]);
+    const float32x4x4_t fgvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[1]);
+    const float32x4x4_t fbvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[2]);
+
+    float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
+    float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
+
+    for(auto i = 0; i < 4; ++i)
+    {
+        rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
+                               fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
+        rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
+                               fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
+    }
+
+    arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]);
+    arm_compute::convert_float32x4x4_to_uint8x16(fuvec_top, vec_top.val[1]);
+    arm_compute::convert_float32x4x4_to_uint8x16(fvvec_top, vec_top.val[2]);
+    arm_compute::convert_float32x4x4_to_uint8x16(fyvec_bottom, vec_bottom.val[0]);
+    arm_compute::convert_float32x4x4_to_uint8x16(fuvec_bottom, vec_bottom.val[1]);
+    arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]);
+}
+
+inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
+                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
+                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+                              unsigned char *const __restrict out_uv)
+{
+    uint8x16x3_t vec_top, vec_bottom;
+    vec_top.val[0]    = rvec_top;
+    vec_top.val[1]    = gvec_top;
+    vec_top.val[2]    = bvec_top;
+    vec_bottom.val[0] = rvec_bottom;
+    vec_bottom.val[1] = gvec_bottom;
+    vec_bottom.val[2] = bvec_bottom;
+
+    rgb_to_yuv_conversion(vec_top, vec_bottom);
+
+    vst1q_u8(out_y_top, vec_top.val[0]);
+    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
+
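+    // 2x2 box-average down to 4:2:0 chroma: vuzpq_u8 separates even and odd
+    // columns, vrhaddq_u8 averages them horizontally, and the final vhadd_u8
+    // across the low (top row) and high (bottom row) halves averages vertically.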
+    const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]);
+    const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]);
+    const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]);
+    const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]);
+
+    uint8x8x2_t uvvec;
+    uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp));
+    uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp));
+
+    vst2_u8(out_uv, uvvec);
+}
+
+inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
+                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
+                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+                              unsigned char *const __restrict out_u,
+                              unsigned char *const __restrict out_v)
+{
+    uint8x16x3_t vec_top, vec_bottom;
+    vec_top.val[0]    = rvec_top;
+    vec_top.val[1]    = gvec_top;
+    vec_top.val[2]    = bvec_top;
+    vec_bottom.val[0] = rvec_bottom;
+    vec_bottom.val[1] = gvec_bottom;
+    vec_bottom.val[2] = bvec_bottom;
+
+    rgb_to_yuv_conversion(vec_top, vec_bottom);
+
+    vst1q_u8(out_y_top, vec_top.val[0]);
+    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
+
+    const auto uvvec_top    = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
+    const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
+    const auto uvvec        = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
+                                        vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
+
+    vst1_u8(out_u, vget_low_u8(uvvec));
+    vst1_u8(out_v, vget_high_u8(uvvec));
+}
+
+inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
+                              unsigned char *const __restrict out_y,
+                              unsigned char *const __restrict out_u,
+                              unsigned char *const __restrict out_v)
+{
+    // Convert the uint8x16_t to float32x4x4_t
+    const float32x4x4_t frvec = arm_compute::convert_uint8x16_to_float32x4x4(rvec);
+    const float32x4x4_t fgvec = arm_compute::convert_uint8x16_to_float32x4x4(gvec);
+    const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec);
+
+    float32x4x4_t fyvec, fuvec, fvvec;
+    for(auto i = 0; i < 4; ++i)
+    {
+        rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
+                               fyvec.val[i], fuvec.val[i], fvvec.val[i]);
+    }
+
+    uint8x16_t yvec, uvec, vvec;
+    arm_compute::convert_float32x4x4_to_uint8x16(fyvec, yvec);
+    arm_compute::convert_float32x4x4_to_uint8x16(fuvec, uvec);
+    arm_compute::convert_float32x4x4_to_uint8x16(fvvec, vvec);
+
+    vst1q_u8(out_y, yvec);
+    vst1q_u8(out_u, uvec);
+    vst1q_u8(out_v, vvec);
+}
+#endif /* DOXYGEN_SKIP_THIS */
+} // namespace
+
+namespace arm_compute
+{
+/** Convert RGB to RGBX.
+ *
+ * @param[in]  input  Input RGB data buffer.
+ * @param[out] output Output RGBX buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto   ta1 = vld3q_u8(in.ptr());
+        uint8x16x4_t ta2;
+        ta2.val[0] = ta1.val[0];
+        ta2.val[1] = ta1.val[1];
+        ta2.val[2] = ta1.val[2];
+        ta2.val[3] = vdupq_n_u8(255);
+        vst4q_u8(out.ptr(), ta2);
+    },
+    in, out);
+}
+
+/** Convert RGB to U8.
+ *
+ * @param[in]  input  Input RGB data buffer.
+ * @param[out] output Output U8 buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta1 = vld3q_u8(in.ptr());
+        uint8x16_t ta2;
+        rgb_to_u8_conversion(ta1, ta2);
+        vst1q_u8(out.ptr(), ta2);
+    },
+    in, out);
+}
+
+/** Convert RGBX to RGB.
+ *
+ * @param[in]  input  Input RGBX data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+void colorconvert_rgbx_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto   ta1 = vld4q_u8(in.ptr());
+        uint8x16x3_t ta2;
+        ta2.val[0] = ta1.val[0];
+        ta2.val[1] = ta1.val[1];
+        ta2.val[2] = ta1.val[2];
+        vst3q_u8(out.ptr(), ta2);
+    },
+    in, out);
+}
+
+/** Convert YUYV to RGB.
+ *
+ * @param[in]  input  Input YUYV data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+template <bool yuyv, bool alpha>
+void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    constexpr auto element_size = alpha ? 32 : 24;
+    constexpr auto shift        = yuyv ? 0 : 1;
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta = vld4q_u8(in.ptr());
+        //ta.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta.val[1] = U0 U2 U4 U6 ...
+        //ta.val[2] = Y1 Y3 Y5 Y7 ...
+        //ta.val[3] = V0 V2 V4 V6 ...
+
+        // Convert the uint8x16x4_t to float32x4x4_t
+        const float32x4x4_t yvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
+        const float32x4x4_t uvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
+        const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
+        const float32x4x4_t vvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
+
+        yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+    },
+    in, out);
+}
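+
+// Note: the boolean template parameters resolve the packed layout at compile
+// time. yuyv == true reads Y0 U0 Y1 V0 ... (YUYV422), while yuyv == false
+// shifts the Y and U/V positions to match U0 Y0 V0 Y1 ... (UYVY422);
+// alpha == true emits 32-bit RGBX texels (element_size = 32), alpha == false
+// emits 24-bit RGB (element_size = 24).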
+
+/** Convert NV12 to RGB.
+ *
+ * @param[in]  input  Input NV12 data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+template <bool uv, bool alpha>
+void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    constexpr auto element_size = alpha ? 32 : 24;
+    const auto     out_stride   = output_ptr->info()->strides_in_bytes().y();
+    constexpr auto shift        = uv ? 0 : 1;
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
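+    // For instance, a luma window of x: [0, 32) step 32 and y: [0, 16) step 2
+    // (two Y rows are consumed per iteration) maps to a chroma window of
+    // x: [0, 16) step 16 and y: [0, 8) step 1, as the interleaved UV plane is
+    // subsampled by two in both dimensions.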
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_uv(input_ptr->plane(1), win_uv);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_uv       = vld2q_u8(in_uv.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        // Convert the uint8x16x4_t to float32x4x4_t
+        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
+        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
+
+        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+
+        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
+    },
+    in_y, in_uv, out);
+}
+
+/** Convert IYUV to RGB.
+ *
+ * @param[in]  input  Input IYUV data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+template <bool alpha>
+void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    constexpr auto element_size = alpha ? 32 : 24;
+    const auto     out_stride   = output_ptr->info()->strides_in_bytes().y();
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_u(input_ptr->plane(1), win_uv);
+    Iterator in_v(input_ptr->plane(2), win_uv);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto *y_top_ptr    = in_y.ptr();
+        const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
+        const auto *u_ptr        = in_u.ptr();
+        const auto *v_ptr        = in_v.ptr();
+
+        // Work-around for an issue in gcc (>= 9) where vld2q might cause problems with register allocation
+#if defined(__aarch64__)
+        const auto ta0_y_top    = vld1q_u8(y_top_ptr);
+        const auto ta1_y_top    = vld1q_u8(y_top_ptr + 16);
+        const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
+        const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
+        const auto ta_u         = vld1q_u8(u_ptr);
+        const auto ta_v         = vld1q_u8(v_ptr);
+
+        // Convert the uint8x16x4_t to float32x4x4_t
+        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
+        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
+        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
+        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
+        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+#else  /* defined(__aarch64__) */
+        const auto ta_y_top    = vld2q_u8(y_top_ptr);
+        const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
+        const auto ta_u        = vld1q_u8(u_ptr);
+        const auto ta_v        = vld1q_u8(v_ptr);
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_u.val[0] = U0 U2 U4 U6 ...
+        //ta_v.val[0] = V0 V2 V4 V6 ...
+
+        // Convert the uint8x16x4_t to float32x4x4_t
+        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+#endif /* defined(__aarch64__) */
+
+        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+
+        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
+    },
+    in_y, in_u, in_v, out);
+}
+
+/** Convert YUYV to NV12.
+ *
+ * @param[in]  input  Input YUYV data buffer.
+ * @param[out] output Output NV12 buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+template <bool yuyv>
+void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = yuyv ? 0 : 1;
+
+    // NV12's UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_uv(output_ptr->plane(1), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta_top    = vld4q_u8(in.ptr());
+        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+        //ta.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta.val[1] = U0 U2 U4 U6 ...
+        //ta.val[2] = Y1 Y3 Y5 Y7 ...
+        //ta.val[3] = V0 V2 V4 V6 ...
+
+        uint8x16x2_t yvec;
+        yvec.val[0] = ta_top.val[0 + shift];
+        yvec.val[1] = ta_top.val[2 + shift];
+        vst2q_u8(out_y.ptr(), yvec);
+
+        uint8x16x2_t yyvec;
+        yyvec.val[0] = ta_bottom.val[0 + shift];
+        yyvec.val[1] = ta_bottom.val[2 + shift];
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
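+        // YUYV already carries one U/V pair per two horizontal texels, so
+        // reaching 4:2:0 only needs the vertical average (vhaddq_u8) between
+        // the two source rows.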
+        uint8x16x2_t uvvec;
+        uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+        uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+        vst2q_u8(out_uv.ptr(), uvvec);
+    },
+    in, out_y, out_uv);
+}
+
+/** Convert IYUV to NV12.
+ *
+ * @param[in]  input  Input IYUV data buffer.
+ * @param[out] output Output NV12 buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_u(input_ptr->plane(1), win_uv);
+    Iterator in_v(input_ptr->plane(2), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_uv(output_ptr->plane(1), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto   ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto   ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        uint8x16x2_t ta_uv;
+        ta_uv.val[0] = vld1q_u8(in_u.ptr());
+        ta_uv.val[1] = vld1q_u8(in_v.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+        vst2q_u8(out_uv.ptr(), ta_uv);
+    },
+    in_y, in_u, in_v, out_y, out_uv);
+}
+
+/** Convert NV12 to IYUV.
+ *
+ * @param[in]  input  Input NV12 data buffer.
+ * @param[out] output Output IYUV buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+template <bool uv>
+void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = uv ? 0 : 1;
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_uv(input_ptr->plane(1), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win_uv);
+    Iterator out_v(output_ptr->plane(2), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_uv       = vld2q_u8(in_uv.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+        vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
+        vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
+    },
+    in_y, in_uv, out_y, out_u, out_v);
+}
+
+/** Convert YUYV to IYUV.
+ *
+ * @param[in]  input  Input YUYV data buffer.
+ * @param[out] output Output IYUV buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+template <bool yuyv>
+void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = yuyv ? 0 : 1;
+
+    // Destination's UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win_uv);
+    Iterator out_v(output_ptr->plane(2), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta_top    = vld4q_u8(in.ptr());
+        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+        //ta.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta.val[1] = U0 U2 U4 U6 ...
+        //ta.val[2] = Y1 Y3 Y5 Y7 ...
+        //ta.val[3] = V0 V2 V4 V6 ...
+
+        uint8x16x2_t yvec;
+        yvec.val[0] = ta_top.val[0 + shift];
+        yvec.val[1] = ta_top.val[2 + shift];
+        vst2q_u8(out_y.ptr(), yvec);
+
+        uint8x16x2_t yyvec;
+        yyvec.val[0] = ta_bottom.val[0 + shift];
+        yyvec.val[1] = ta_bottom.val[2 + shift];
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+        uint8x16_t uvec;
+        uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+        vst1q_u8(out_u.ptr(), uvec);
+
+        uint8x16_t vvec;
+        vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+        vst1q_u8(out_v.ptr(), vvec);
+    },
+    in, out_y, out_u, out_v);
+}
+
+/** Convert NV12 to YUV4.
+ *
+ * @param[in]  input  Input NV12 data buffer.
+ * @param[out] output Output YUV4 buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+template <bool uv>
+void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = uv ? 0 : 1;
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_uv(input_ptr->plane(1), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win);
+    Iterator out_v(output_ptr->plane(2), win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_uv       = vld2q_u8(in_uv.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
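+        // YUV4 carries one chroma sample per pixel, so each U value is duplicated
+        // into both lanes of uvec; vst2q_u8 then interleaves them (U0 U0 U2 U2 ...)
+        // and the second store repeats the row, a 2x2 nearest-neighbour upsample.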
+        uint8x16x2_t uvec;
+        uvec.val[0] = ta_uv.val[0 + shift];
+        uvec.val[1] = ta_uv.val[0 + shift];
+        vst2q_u8(out_u.ptr(), uvec);
+        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+        uint8x16x2_t vvec;
+        vvec.val[0] = ta_uv.val[1 - shift];
+        vvec.val[1] = ta_uv.val[1 - shift];
+        vst2q_u8(out_v.ptr(), vvec);
+        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+    },
+    in_y, in_uv, out_y, out_u, out_v);
+}
+
+/** Convert IYUV to YUV4.
+ *
+ * @param[in]  input  Input IYUV data buffer.
+ * @param[out] output Output YUV4 buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_u(input_ptr->plane(1), win_uv);
+    Iterator in_v(input_ptr->plane(2), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win);
+    Iterator out_v(output_ptr->plane(2), win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_u        = vld1q_u8(in_u.ptr());
+        const auto ta_v        = vld1q_u8(in_v.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_u = U0 U2 U4 U6 ...
+        //ta_v = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+        uint8x16x2_t uvec;
+        uvec.val[0] = ta_u;
+        uvec.val[1] = ta_u;
+        vst2q_u8(out_u.ptr(), uvec);
+        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+        uint8x16x2_t vvec;
+        vvec.val[0] = ta_v;
+        vvec.val[1] = ta_v;
+        vst2q_u8(out_v.ptr(), vvec);
+        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+    },
+    in_y, in_u, in_v, out_y, out_u, out_v);
+}
+
+/** Convert RGB to NV12.
+ *
+ * @param[in]  input  Input RGB data buffer.
+ * @param[out] output Output NV12 buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+template <bool alpha>
+void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_uv(output_ptr->plane(1), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
+        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+        //ta_rgb.val[0] = R0 R1 R2 R3 ...
+        //ta_rgb.val[1] = G0 G1 G2 G3 ...
+        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+        store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
+                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
+                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
+                          out_uv.ptr());
+    },
+    in, out_y, out_uv);
+}
+
+/** Convert RGB to IYUV.
+ *
+ * @param[in]  input  Input RGB data buffer.
+ * @param[out] output Output IYUV buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+template <bool alpha>
+void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win_uv);
+    Iterator out_v(output_ptr->plane(2), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
+        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+        //ta_rgb.val[0] = R0 R1 R2 R3 ...
+        //ta_rgb.val[1] = G0 G1 G2 G3 ...
+        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+        store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
+                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
+                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
+                          out_u.ptr(), out_v.ptr());
+    },
+    in, out_y, out_u, out_v);
+}
+
+/** Convert RGB to YUV4.
+ *
+ * @param[in]  input  Input RGB data buffer.
+ * @param[out] output Output YUV4 buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+template <bool alpha>
+void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win);
+    Iterator out_v(output_ptr->plane(2), win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta_rgb = load_rgb(in.ptr(), alpha);
+        //ta_rgb.val[0] = R0 R1 R2 R3 ...
+        //ta_rgb.val[1] = G0 G1 G2 G3 ...
+        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+        store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
+                          out_y.ptr(), out_u.ptr(), out_v.ptr());
+    },
+    in, out_y, out_u, out_v);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
new file mode 100644
index 0000000..96defbc
--- /dev/null
+++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H
+#define ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace detail
+{
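+/* Loads one row of a 3x3 filter: each of the three weights is broadcast across all
+   four lanes of its own vector (vld1q_dup_f32), so a single vmlaq below can apply
+   that tap to four consecutive output positions at once. */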
+inline float32x4x3_t load_matrix_row(const float *ptr)
+{
+    const float32x4x3_t r =
+    {
+        {
+            vld1q_dup_f32(ptr),
+            vld1q_dup_f32(1 + ptr),
+            vld1q_dup_f32(2 + ptr)
+        }
+    };
+    return r;
+}
+
+template <unsigned int stridex>
+float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2);
+
+template <>
+inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+{
+    const float32x4x3_t vtop =
+    {
+        {
+            vld1q_f32(in_top),
+            vld1q_f32(in_top + 4),
+            vld1q_f32(in_top + 8)
+        }
+    };
+    const float32x4x3_t vmid =
+    {
+        {
+            vld1q_f32(in_mid),
+            vld1q_f32(in_mid + 4),
+            vld1q_f32(in_mid + 8)
+        }
+    };
+    const float32x4x3_t vlow =
+    {
+        {
+            vld1q_f32(in_low),
+            vld1q_f32(in_low + 4),
+            vld1q_f32(in_low + 8)
+        }
+    };
+    float32x4x2_t out =
+    {
+        {
+            vmulq_f32(vtop.val[0], m0.val[0]),
+            vmulq_f32(vtop.val[1], m0.val[0])
+        }
+    };
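+    // vextq_f32(a, b, n) forms the input window shifted by n elements (a[n..3], b[0..n-1]),
+    // so each vmlaq below accumulates one filter tap over four consecutive positions:
+    // m.val[0] at x, m.val[1] at x + 1, m.val[2] at x + 2.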
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+
+    out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
+
+    out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
+
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
+
+    out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
+
+    out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
+    return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+{
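+    // Reuse the stride-1 kernel and keep every second result: lanes 0 and 2 of each
+    // half are compacted into lanes 0..3, yielding the outputs for x = 0, 2, 4, 6.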
+    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
+    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
+    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
+    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
+    return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+{
+    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
+    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+    return out;
+}
+
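+/* store_results<stridex> writes back only the outputs that are valid for the given
+   stride: 8 values for stride 1, 4 for stride 2 and 2 for stride 3 (the compacted
+   lanes produced by convolve_3x3 above). */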
+template <unsigned int stridex>
+void store_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+void store_results<1>(float *buffer, const float32x4x2_t &values)
+{
+    vst1q_f32(buffer, values.val[0]);
+    vst1q_f32(buffer + 4, values.val[1]);
+}
+
+template <>
+void store_results<2>(float *buffer, const float32x4x2_t &values)
+{
+    vst1q_f32(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(float *buffer, const float32x4x2_t &values)
+{
+    vst1_f32(buffer, vget_low_f32(values.val[0]));
+}
+
+template <unsigned int stridex>
+int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);
+
+template <>
+int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
+{
+    return num_elems_written_per_iteration;
+}
+
+template <>
+int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
+{
+    return num_elems_written_per_iteration << 1;
+}
+
+template <>
+int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
+{
+    return num_elems_written_per_iteration * 3;
+}
+} // namespace detail
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
\ No newline at end of file
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
new file mode 100644
index 0000000..d7ee70a
--- /dev/null
+++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -0,0 +1,965 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H
+#define ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/utils/misc/Requires.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace detail
+{
+/** Loads a 3x3 matrix as a row (float).
+ *
+ * @param[in] ptr            Pointer to a float 3x3 matrix.
+ * @param[in] weights_offset (Optional) Weights quantization offset.
+ *
+ * @return The loaded matrix.
+ */
+inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
+{
+    ARM_COMPUTE_UNUSED(weights_offset);
+    const float32x4x3_t r =
+    {
+        {
+            vld1q_dup_f32(ptr),
+            vld1q_dup_f32(1 + ptr),
+            vld1q_dup_f32(2 + ptr)
+        }
+    };
+    return r;
+}
+
+/** Loads a 3x3 matrix as a row (uint8_t/int8_t).
+ *
+ * @param[in] ptr            Pointer to a uint8_t/int8_t 3x3 matrix.
+ * @param[in] weights_offset (Optional) Weights quantization offset.
+ *
+ * @return The loaded matrix.
+ */
+template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
+inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0)
+{
+    const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
+
+    /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
+       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
+    int32x4x3_t r =
+    {
+        {
+            vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
+            vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
+            vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
+        }
+    };
+    return r;
+}
+
+/** Stores a float32x4x2_t array into a memory location.
+ *
+ * @param[in] buffer Pointer to the memory location where the values will be stored.
+ * @param[in] values Values that will be stored.
+ *
+ */
+template <unsigned int stridex>
+void store_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+inline void store_results<1>(float *buffer, const float32x4x2_t &values)
+{
+    vst1q_f32(buffer, values.val[0]);
+    vst1q_f32(buffer + 4, values.val[1]);
+}
+
+template <>
+inline void store_results<2>(float *buffer, const float32x4x2_t &values)
+{
+    vst1q_f32(buffer, values.val[0]);
+}
+
+template <>
+inline void store_results<3>(float *buffer, const float32x4x2_t &values)
+{
+    vst1_f32(buffer, vget_low_f32(values.val[0]));
+}
+
+/** Stores an int32x4x2_t array into a memory location.
+ *
+ * @param[in] buffer Pointer to the memory location where the values will be stored.
+ * @param[in] values Values that will be stored.
+ *
+ */
+template <unsigned int stridex>
+void store_results(int32_t *buffer, const int32x4x2_t &values);
+
+template <>
+inline void store_results<1>(int32_t *buffer, const int32x4x2_t &values)
+{
+    vst1q_s32(buffer, values.val[0]);
+    vst1q_s32(buffer + 4, values.val[1]);
+}
+
+template <>
+inline void store_results<2>(int32_t *buffer, const int32x4x2_t &values)
+{
+    vst1q_s32(buffer, values.val[0]);
+}
+
+template <>
+inline void store_results<3>(int32_t *buffer, const int32x4x2_t &values)
+{
+    vst1_s32(buffer, vget_low_s32(values.val[0]));
+}
+
+template <unsigned int stridex>
+inline void accumulate_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+inline void accumulate_results<1>(float *buffer, const float32x4x2_t &values)
+{
+    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
+    vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
+}
+
+template <>
+inline void accumulate_results<2>(float *buffer, const float32x4x2_t &values)
+{
+    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
+}
+
+template <>
+inline void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
+{
+    vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
+}
+
+template <unsigned int stridex>
+void accumulate_results(int32_t *buffer, const int32x4x2_t &values);
+
+template <>
+inline void accumulate_results<1>(int32_t *buffer, const int32x4x2_t &values)
+{
+    vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
+    vst1q_s32(buffer + 4, vaddq_s32(vld1q_s32(buffer + 4), values.val[1]));
+}
+
+template <>
+inline void accumulate_results<2>(int32_t *buffer, const int32x4x2_t &values)
+{
+    vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
+}
+
+template <>
+inline void accumulate_results<3>(int32_t *buffer, const int32x4x2_t &values)
+{
+    vst1_s32(buffer, vadd_s32(vld1_s32(buffer), vget_low_s32(values.val[0])));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** Stores a float16x8x2_t array into a memory location.
+ *
+ * @param[in] buffer Pointer to the memory location where the values will be stored.
+ * @param[in] values Values that will be stored.
+ *
+ */
+template <unsigned int stridex>
+void store_results(float16_t *buffer, const float16x8x2_t &values);
+
+template <>
+inline void store_results<1>(float16_t *buffer, const float16x8x2_t &values)
+{
+    vst1q_f16(buffer, values.val[0]);
+    vst1q_f16(buffer + 8, values.val[1]);
+}
+
+template <>
+inline void store_results<2>(float16_t *buffer, const float16x8x2_t &values)
+{
+    vst1q_f16(buffer, values.val[0]);
+}
+
+template <>
+inline void store_results<3>(float16_t *buffer, const float16x8x2_t &values)
+{
+    vst1_f16(buffer, vget_low_f16(values.val[0]));
+}
+
+template <unsigned int stridex>
+inline void accumulate_results(float16_t *buffer, const float16x8x2_t &values);
+
+template <>
+inline void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values)
+{
+    vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
+    vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1]));
+}
+
+template <>
+inline void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values)
+{
+    vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
+}
+
+template <>
+inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
+{
+    vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0])));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+/** Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] input_offset Input quantization offset (unused in the float variant).
+ *
+ */
+inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
+                                                const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
+                                                const size_t dilation_x, int input_offset)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+
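+    // With dilation the three taps of a filter row are dilation_x elements apart,
+    // so the loads below pick up the input at x, x + dilation_x and
+    // x + 2 * dilation_x instead of three adjacent positions.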
+    const float32x4x3_t vtop =
+    {
+        {
+            vld1q_f32(in_top),
+            vld1q_f32(in_top + dilation_x),
+            vld1q_f32(in_top + 2 * dilation_x)
+        }
+    };
+    const float32x4x3_t vmid =
+    {
+        {
+            vld1q_f32(in_mid),
+            vld1q_f32(in_mid + dilation_x),
+            vld1q_f32(in_mid + 2 * dilation_x)
+        }
+    };
+    const float32x4x3_t vlow =
+    {
+        {
+            vld1q_f32(in_low),
+            vld1q_f32(in_low + dilation_x),
+            vld1q_f32(in_low + 2 * dilation_x)
+        }
+    };
+    float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
+    out             = vmlaq_f32(out, vtop.val[1], m0.val[1]);
+    out             = vmlaq_f32(out, vtop.val[2], m0.val[2]);
+
+    out = vmlaq_f32(out, vmid.val[0], m1.val[0]);
+    out = vmlaq_f32(out, vmid.val[1], m1.val[1]);
+    out = vmlaq_f32(out, vmid.val[2], m1.val[2]);
+
+    out = vmlaq_f32(out, vlow.val[0], m2.val[0]);
+    out = vmlaq_f32(out, vlow.val[1], m2.val[1]);
+    out = vmlaq_f32(out, vlow.val[2], m2.val[2]);
+
+    return out;
+}
+
+/** Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] stridex      Stride value in elements across x.
+ * @param[in] input_offset (Optional) Input quantization offset.
+ *
+ */
+inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
+                                           const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
+                                           const size_t dilation_x, unsigned int stridex, int input_offset = 0)
+{
+    ARM_COMPUTE_ERROR_ON(stridex > 3);
+    float32x4x2_t out =
+    {
+        {
+            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+            single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
+        }
+    };
+
+    if(stridex == 2)
+    {
+        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
+        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
+        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
+    }
+    else if(stridex == 3)
+    {
+        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+    }
+
+    return out;
+}
+
+/** Perform a convolve3x3 on float32.
+ *
+ * @param[in]  in_top       Pointer to the first row of the input.
+ * @param[in]  in_mid       Pointer to the second row of the input.
+ * @param[in]  in_low       Pointer to the third row of the input.
+ * @param[out] out_ptr      Pointer to the output.
+ * @param[in]  m0           First row of the filter.
+ * @param[in]  m1           Second row of the filter.
+ * @param[in]  m2           Third row of the filter.
+ * @param[in]  stridex      Stride value in elements across x.
+ * @param[in]  input_offset (Optional) Input quantization offset.
+ *
+ */
+template <bool accumulate>
+void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
+                  const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
+                  unsigned int stridex, int input_offset = 0);
+
+template <bool accumulate>
+inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
+                         const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
+                         unsigned int stridex, int input_offset)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+    ARM_COMPUTE_ERROR_ON(stridex > 3);
+
+    float32x4x2_t out =
+    {
+        {
+            vdupq_n_f32(0.f),
+            vdupq_n_f32(0.f)
+        }
+    };
+    if(stridex == 2)
+    {
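+        // vld2q_f32 de-interleaves 8 consecutive floats: val[0] holds the even
+        // columns (tap x) and val[1] the odd columns (tap x + 1); the third tap
+        // (x + 2) is the even columns shifted by one, built with vextq_f32 below.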
+        const float32x4x2_t vtop     = vld2q_f32(in_top);
+        const float32x4x2_t vmid     = vld2q_f32(in_mid);
+        const float32x4x2_t vlow     = vld2q_f32(in_low);
+        const float32x4_t   vtop_end = vld1q_f32(in_top + 8);
+        const float32x4_t   vmid_end = vld1q_f32(in_mid + 8);
+        const float32x4_t   vlow_end = vld1q_f32(in_low + 8);
+
+        out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
+
+        out.val[0] = vmlaq_f32(out.val[0], vtop.val[1], m0.val[1]);
+        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop_end, 1), m0.val[2]);
+
+        out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
+        out.val[0] = vmlaq_f32(out.val[0], vmid.val[1], m1.val[1]);
+        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid_end, 1), m1.val[2]);
+
+        out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
+        out.val[0] = vmlaq_f32(out.val[0], vlow.val[1], m2.val[1]);
+        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow_end, 1), m2.val[2]);
+
+        accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
+    }
+    else
+    {
+        const float32x4x3_t vtop =
+        {
+            {
+                vld1q_f32(in_top),
+                vld1q_f32(in_top + 4),
+                vld1q_f32(in_top + 8)
+            }
+        };
+        const float32x4x3_t vmid =
+        {
+            {
+                vld1q_f32(in_mid),
+                vld1q_f32(in_mid + 4),
+                vld1q_f32(in_mid + 8)
+            }
+        };
+        const float32x4x3_t vlow =
+        {
+            {
+                vld1q_f32(in_low),
+                vld1q_f32(in_low + 4),
+                vld1q_f32(in_low + 8)
+            }
+        };
+        out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
+        out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
+
+        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+
+        out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
+        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
+        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
+
+        out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
+        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
+        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
+
+        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
+        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
+
+        out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
+        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
+        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
+
+        out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
+        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
+        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
+
+        if(stridex == 3)
+        {
+            out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+            accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
+        }
+        else
+        {
+            accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
+        }
+    }
+}
+
+/** Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] input_offset Input quantization offset.
+ *
+ */
+template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
+inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low,
+                                              const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
+                                              size_t dilation_x, int32_t input_offset)
+{
+    using VectorType    = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
+    using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
+
+    const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
+
+    const VectorType vtop =
+    {
+        {
+            wrapper::vload(in_top),
+            wrapper::vload(in_top + dilation_x),
+            wrapper::vload(in_top + 2 * dilation_x)
+        }
+    };
+    const VectorType vmid =
+    {
+        {
+            wrapper::vload(in_mid),
+            wrapper::vload(in_mid + dilation_x),
+            wrapper::vload(in_mid + 2 * dilation_x)
+        }
+    };
+    const VectorType vlow =
+    {
+        {
+            wrapper::vload(in_low),
+            wrapper::vload(in_low + dilation_x),
+            wrapper::vload(in_low + 2 * dilation_x)
+        }
+    };
+
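+    // Widen the 8-bit inputs to 16 bit (vmovl), take the lower four lanes and
+    // widen again while adding the quantization offset (vaddw), producing the
+    // int32 operands the multiply-accumulates below work on.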
+    const int32x4x3_t vtop_s32 =
+    {
+        {
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
+        }
+    };
+    const int32x4x3_t vmid_s32 =
+    {
+        {
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
+        }
+    };
+    const int32x4x3_t vlow_s32 =
+    {
+        {
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
+        }
+    };
+
+    int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
+    out           = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
+    out           = wrapper::vmla(out, vtop_s32.val[2], m0.val[2]);
+
+    out = wrapper::vmla(out, vmid_s32.val[0], m1.val[0]);
+    out = wrapper::vmla(out, vmid_s32.val[1], m1.val[1]);
+    out = wrapper::vmla(out, vmid_s32.val[2], m1.val[2]);
+
+    out = wrapper::vmla(out, vlow_s32.val[0], m2.val[0]);
+    out = wrapper::vmla(out, vlow_s32.val[1], m2.val[1]);
+    out = wrapper::vmla(out, vlow_s32.val[2], m2.val[2]);
+
+    return out;
+}
+
+/** Perform a 3x3 convolution for 8 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] stridex      Stride value in elements across x.
+ * @param[in] input_offset Input quantization offset.
+ *
+ */
+template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
+inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
+                                         const size_t dilation_x, unsigned int stridex, int input_offset)
+{
+    ARM_COMPUTE_ERROR_ON(stridex > 3);
+    int32x4x2_t out =
+    {
+        {
+            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+            single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
+        }
+    };
+
+    if(stridex == 2)
+    {
+        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
+        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
+        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
+    }
+    else if(stridex == 3)
+    {
+        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
+    }
+    return out;
+}
+
+/** Perform a convolve3x3 on 8-bit elements
+ *
+ * @param[in]  in_top       Pointer to the first row of the input.
+ * @param[in]  in_mid       Pointer to the second row of the input.
+ * @param[in]  in_low       Pointer to the third row of the input.
+ * @param[out] out_ptr      Pointer to the output.
+ * @param[in]  m0           First row of the filter.
+ * @param[in]  m1           Second row of the filter.
+ * @param[in]  m2           Third row of the filter.
+ * @param[in]  stridex      Stride value in elements across x.
+ * @param[in]  input_offset Input quantization offset.
+ *
+ */
+template < bool accumulate, typename T1, typename T2, REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value) >
+void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr,
+                  const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
+                  unsigned int stridex, int32_t input_offset)
+{
+    ARM_COMPUTE_ERROR_ON(stridex > 3);
+    using VectorType    = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
+    using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
+
+    const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
+
+    const VectorType vtop =
+    {
+        {
+            wrapper::vload(in_top),
+            wrapper::vload(in_top + 8)
+        }
+    };
+    const VectorType vmid =
+    {
+        {
+            wrapper::vload(in_mid),
+            wrapper::vload(in_mid + 8)
+        }
+    };
+    const VectorType vlow =
+    {
+        {
+            wrapper::vload(in_low),
+            wrapper::vload(in_low + 8)
+        }
+    };
+
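+    // Elements 0..3, 4..7 and 8..11 of each input row are widened to int32 with the
+    // input offset added, so the vext_1/vext_2 windows below cover all eight outputs.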
+    const int32x4x3_t vtop_s32 =
+    {
+        {
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+        }
+    };
+    const int32x4x3_t vmid_s32 =
+    {
+        {
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+        }
+    };
+    const int32x4x3_t vlow_s32 =
+    {
+        {
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
+            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+        }
+    };
+
+    int32x4x2_t out
+    {
+        {
+            wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+            wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+        }
+    };
+
+    // 0
+    out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
+    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vtop_s32.val[0], vtop_s32.val[1]), m0.val[1]);
+    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vtop_s32.val[0], vtop_s32.val[1]), m0.val[2]);
+
+    out.val[0] = wrapper::vmla(out.val[0], vmid_s32.val[0], m1.val[0]);
+    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vmid_s32.val[0], vmid_s32.val[1]), m1.val[1]);
+    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vmid_s32.val[0], vmid_s32.val[1]), m1.val[2]);
+
+    out.val[0] = wrapper::vmla(out.val[0], vlow_s32.val[0], m2.val[0]);
+    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vlow_s32.val[0], vlow_s32.val[1]), m2.val[1]);
+    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vlow_s32.val[0], vlow_s32.val[1]), m2.val[2]);
+
+    // 1
+    out.val[1] = wrapper::vmla(out.val[1], vtop_s32.val[1], m0.val[0]);
+    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vtop_s32.val[1], vtop_s32.val[2]), m0.val[1]);
+    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vtop_s32.val[1], vtop_s32.val[2]), m0.val[2]);
+
+    out.val[1] = wrapper::vmla(out.val[1], vmid_s32.val[1], m1.val[0]);
+    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vmid_s32.val[1], vmid_s32.val[2]), m1.val[1]);
+    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vmid_s32.val[1], vmid_s32.val[2]), m1.val[2]);
+
+    out.val[1] = wrapper::vmla(out.val[1], vlow_s32.val[1], m2.val[0]);
+    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
+    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
+
+    if(stridex == 1)
+    {
+        accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
+    }
+    else if(stridex == 2)
+    {
+        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
+        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
+        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
+
+        accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
+    }
+    else if(stridex == 3)
+    {
+        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
+        accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
+    }
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** Loads a 3x3 matrix as a row (float16_t).
+ *
+ * @param[in] ptr            Pointer to a float16_t 3x3 matrix.
+ * @param[in] weights_offset (Optional) Weights quantization offset.
+ *
+ * @return The loaded matrix.
+ */
+inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = 0)
+{
+    ARM_COMPUTE_UNUSED(weights_offset);
+    /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
+       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
+    const float16x8x3_t r =
+    {
+        {
+            vld1q_dup_f16(ptr),
+            vld1q_dup_f16(1 + ptr),
+            vld1q_dup_f16(2 + ptr)
+        }
+    };
+    return r;
+}
+
+/** Perform a 3x3 convolution for 8 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] input_offset (Optional) Input quantization offset.
+ *
+ */
+inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
+                                                const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+                                                const size_t dilation_x, int input_offset = 0)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+    const float16x8x3_t vtop =
+    {
+        {
+            vld1q_f16(in_top),
+            vld1q_f16(in_top + dilation_x),
+            vld1q_f16(in_top + 2 * dilation_x)
+        }
+    };
+    const float16x8x3_t vmid =
+    {
+        {
+            vld1q_f16(in_mid),
+            vld1q_f16(in_mid + dilation_x),
+            vld1q_f16(in_mid + 2 * dilation_x)
+        }
+    };
+    const float16x8x3_t vlow =
+    {
+        {
+            vld1q_f16(in_low),
+            vld1q_f16(in_low + dilation_x),
+            vld1q_f16(in_low + 2 * dilation_x)
+        }
+    };
+    float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]);
+    out             = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1]));
+    out             = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2]));
+
+    out = vaddq_f16(out, vmulq_f16(vmid.val[0], m1.val[0]));
+    out = vaddq_f16(out, vmulq_f16(vmid.val[1], m1.val[1]));
+    out = vaddq_f16(out, vmulq_f16(vmid.val[2], m1.val[2]));
+
+    out = vaddq_f16(out, vmulq_f16(vlow.val[0], m2.val[0]));
+    out = vaddq_f16(out, vmulq_f16(vlow.val[1], m2.val[1]));
+    out = vaddq_f16(out, vmulq_f16(vlow.val[2], m2.val[2]));
+
+    return out;
+}
+
+/** Perform a 3x3 convolution for 16 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] stridex      Stride value in elements across x.
+ * @param[in] input_offset (Optional) Input quantization offset.
+ *
+ */
+inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
+                                           const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+                                           const size_t dilation_x, unsigned int stridex, int input_offset = 0)
+{
+    ARM_COMPUTE_ERROR_ON(stridex > 3);
+    float16x8x2_t out =
+    {
+        {
+            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+            single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)
+        }
+    };
+
+    if(stridex == 2)
+    {
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
+    }
+    else if(stridex == 3)
+    {
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);
+    }
+
+    return out;
+}
+
+/** Perform a convolve3x3 on float16.
+ *
+ * @param[in]  in_top       Pointer to the first row of the input.
+ * @param[in]  in_mid       Pointer to the second row of the input.
+ * @param[in]  in_low       Pointer to the third row of the input.
+ * @param[out] out_ptr      Pointer to the output.
+ * @param[in]  m0           First row of the filter.
+ * @param[in]  m1           Second row of the filter.
+ * @param[in]  m2           Third row of the filter.
+ * @param[in]  stridex      Stride value in elements across x.
+ * @param[in]  input_offset (Optional) Input quantization offset.
+ *
+ */
+template <bool accumulate>
+inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, float16_t *out_ptr,
+                         const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+                         unsigned int stridex, int input_offset = 0)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+    ARM_COMPUTE_ERROR_ON(stridex > 3);
+
+    float16x8x2_t out =
+    {
+        {
+            vdupq_n_f16(0),
+            vdupq_n_f16(0)
+        }
+    };
+    if(stridex == 2)
+    {
+        const float16x8x2_t vtop     = vld2q_f16(in_top);
+        const float16x8x2_t vmid     = vld2q_f16(in_mid);
+        const float16x8x2_t vlow     = vld2q_f16(in_low);
+        const float16x8_t   vtop_end = vld1q_f16(in_top + 16);
+        const float16x8_t   vmid_end = vld1q_f16(in_mid + 16);
+        const float16x8_t   vlow_end = vld1q_f16(in_low + 16);
+
+        out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
+
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vtop.val[1], m0.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop_end, 1), m0.val[2]));
+
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[1], m1.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid_end, 1), m1.val[2]));
+
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[1], m2.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow_end, 1), m2.val[2]));
+
+        accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
+    }
+    else
+    {
+        const float16x8x3_t vtop =
+        {
+            {
+                vld1q_f16(in_top),
+                vld1q_f16(in_top + 8),
+                vld1q_f16(in_top + 16)
+            }
+        };
+        const float16x8x3_t vmid =
+        {
+            {
+                vld1q_f16(in_mid),
+                vld1q_f16(in_mid + 8),
+                vld1q_f16(in_mid + 16)
+            }
+        };
+        const float16x8x3_t vlow =
+        {
+            {
+                vld1q_f16(in_low),
+                vld1q_f16(in_low + 8),
+                vld1q_f16(in_low + 16)
+            }
+        };
+        out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
+        out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
+
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
+
+        if(stridex == 3)
+        {
+            out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
+            out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
+            out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);
+
+            accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
+        }
+        else
+        {
+            accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
+        }
+    }
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+/** Get the number of elements processed on 3x3 convolution.
+ *
+ * @param[in] num_elems_written_per_iteration Number of elements written per iteration on 3x3 convolution.
+ * @param[in] stridex                         Stride value in elements across x.
+ *
+ * @return The number of elements processed.
+ */
+inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex)
+{
+    switch(stridex)
+    {
+        case 1:
+            return num_elems_written_per_iteration;
+        case 2:
+            return num_elems_written_per_iteration << 1;
+        case 3:
+            return num_elems_written_per_iteration * 3;
+        default:
+            ARM_COMPUTE_ERROR("stridex not supported");
+            return 0;
+    }
+}
+} // namespace detail
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */
diff --git a/src/core/NEON/wrapper/intrinsics/abs.h b/src/core/NEON/wrapper/intrinsics/abs.h
new file mode 100644
index 0000000..0d49a9e
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/abs.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_ABS_H
+#define ARM_COMPUTE_WRAPPER_ABS_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VABS_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vabs(const vtype &a)            \
+    {                                            \
+        return prefix##_##postfix(a);            \
+    }
+
+#define VQABS_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vqabs(const vtype &a)            \
+    {                                             \
+        return prefix##_##postfix(a);             \
+    }
+
+// Absolute: vabs{q}_<type>. Vd[i] = |Va[i]|
+VABS_IMPL(int8x8_t, int8x8_t, vabs, s8)
+VABS_IMPL(int16x4_t, int16x4_t, vabs, s16)
+VABS_IMPL(int32x2_t, int32x2_t, vabs, s32)
+VABS_IMPL(float32x2_t, float32x2_t, vabs, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VABS_IMPL(float16x4_t, float16x4_t, vabs, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VABS_IMPL(int8x16_t, int8x16_t, vabsq, s8)
+VABS_IMPL(int16x8_t, int16x8_t, vabsq, s16)
+VABS_IMPL(int32x4_t, int32x4_t, vabsq, s32)
+VABS_IMPL(float32x4_t, float32x4_t, vabsq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VABS_IMPL(float16x8_t, float16x8_t, vabsq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+// Saturating absolute: vqabs{q}_<type>. Vd[i] = sat(|Va[i]|)
+VQABS_IMPL(int8x8_t, int8x8_t, vqabs, s8)
+VQABS_IMPL(int16x4_t, int16x4_t, vqabs, s16)
+VQABS_IMPL(int32x2_t, int32x2_t, vqabs, s32)
+
+VQABS_IMPL(int8x16_t, int8x16_t, vqabsq, s8)
+VQABS_IMPL(int16x8_t, int16x8_t, vqabsq, s16)
+VQABS_IMPL(int32x4_t, int32x4_t, vqabsq, s32)
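+
+// Illustrative contrast (hypothetical values): for int8 lanes holding -128,
+//   vabs(v)  wraps and leaves -128 (two's complement overflow), while
+//   vqabs(v) saturates to 127.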
+
+#undef VABS_IMPL
+#undef VQABS_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_ABS_H */
diff --git a/src/core/NEON/wrapper/intrinsics/add.h b/src/core/NEON/wrapper/intrinsics/add.h
new file mode 100644
index 0000000..6134d75
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/add.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_ADD_H
+#define ARM_COMPUTE_WRAPPER_ADD_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VADD_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vadd(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VADD_IMPL(uint8x8_t, uint8x8_t, vadd, u8)
+VADD_IMPL(int8x8_t, int8x8_t, vadd, s8)
+VADD_IMPL(uint16x4_t, uint16x4_t, vadd, u16)
+VADD_IMPL(int16x4_t, int16x4_t, vadd, s16)
+VADD_IMPL(uint32x2_t, uint32x2_t, vadd, u32)
+VADD_IMPL(int32x2_t, int32x2_t, vadd, s32)
+VADD_IMPL(uint64x1_t, uint64x1_t, vadd, u64)
+VADD_IMPL(int64x1_t, int64x1_t, vadd, s64)
+VADD_IMPL(float32x2_t, float32x2_t, vadd, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VADD_IMPL(float16x4_t, float16x4_t, vadd, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VADD_IMPL(uint8x16_t, uint8x16_t, vaddq, u8)
+VADD_IMPL(int8x16_t, int8x16_t, vaddq, s8)
+VADD_IMPL(uint16x8_t, uint16x8_t, vaddq, u16)
+VADD_IMPL(int16x8_t, int16x8_t, vaddq, s16)
+VADD_IMPL(uint32x4_t, uint32x4_t, vaddq, u32)
+VADD_IMPL(int32x4_t, int32x4_t, vaddq, s32)
+VADD_IMPL(uint64x2_t, uint64x2_t, vaddq, u64)
+VADD_IMPL(int64x2_t, int64x2_t, vaddq, s64)
+VADD_IMPL(float32x4_t, float32x4_t, vaddq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VADD_IMPL(float16x8_t, float16x8_t, vaddq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
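+// Illustrative usage (hypothetical values): vadd is modular, e.g. for uint8x8_t,
+//   vadd(vdup_n_u8(200), vdup_n_u8(100)) yields 44 in every lane (300 mod 256).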
+#undef VADD_IMPL
+
+// VQADD: Vector saturating add (No notion of saturation for floating point)
+#define VQADD_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vqadd(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VQADD_IMPL(uint8x8_t, uint8x8_t, vqadd, u8)
+VQADD_IMPL(int8x8_t, int8x8_t, vqadd, s8)
+VQADD_IMPL(uint16x4_t, uint16x4_t, vqadd, u16)
+VQADD_IMPL(int16x4_t, int16x4_t, vqadd, s16)
+VQADD_IMPL(uint32x2_t, uint32x2_t, vqadd, u32)
+VQADD_IMPL(int32x2_t, int32x2_t, vqadd, s32)
+VQADD_IMPL(uint64x1_t, uint64x1_t, vqadd, u64)
+VQADD_IMPL(int64x1_t, int64x1_t, vqadd, s64)
+VQADD_IMPL(float32x2_t, float32x2_t, vadd, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VQADD_IMPL(float16x4_t, float16x4_t, vadd, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VQADD_IMPL(uint8x16_t, uint8x16_t, vqaddq, u8)
+VQADD_IMPL(int8x16_t, int8x16_t, vqaddq, s8)
+VQADD_IMPL(uint16x8_t, uint16x8_t, vqaddq, u16)
+VQADD_IMPL(int16x8_t, int16x8_t, vqaddq, s16)
+VQADD_IMPL(uint32x4_t, uint32x4_t, vqaddq, u32)
+VQADD_IMPL(int32x4_t, int32x4_t, vqaddq, s32)
+VQADD_IMPL(uint64x2_t, uint64x2_t, vqaddq, u64)
+VQADD_IMPL(int64x2_t, int64x2_t, vqaddq, s64)
+VQADD_IMPL(float32x4_t, float32x4_t, vaddq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VQADD_IMPL(float16x8_t, float16x8_t, vaddq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
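+// Illustrative contrast with vadd (hypothetical values): for uint8x8_t,
+//   vqadd(vdup_n_u8(200), vdup_n_u8(100)) saturates to 255 in every lane.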
+#undef VQADD_IMPL
+
+// VADDW: Vector widening add
+#define VADDW_IMPL(wtype, vtype, prefix, postfix)      \
+    inline wtype vaddw(const wtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VADDW_IMPL(uint16x8_t, uint8x8_t, vaddw, u8)
+VADDW_IMPL(int16x8_t, int8x8_t, vaddw, s8)
+VADDW_IMPL(uint32x4_t, uint16x4_t, vaddw, u16)
+VADDW_IMPL(int32x4_t, int16x4_t, vaddw, s16)
+VADDW_IMPL(uint64x2_t, uint32x2_t, vaddw, u32)
+VADDW_IMPL(int64x2_t, int32x2_t, vaddw, s32)
+#undef VADDW_IMPL
+
+// VADDL: Vector long add
+#define VADDL_IMPL(wtype, vtype, prefix, postfix)      \
+    inline wtype vaddl(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VADDL_IMPL(uint16x8_t, uint8x8_t, vaddl, u8)
+VADDL_IMPL(int16x8_t, int8x8_t, vaddl, s8)
+VADDL_IMPL(uint32x4_t, uint16x4_t, vaddl, u16)
+VADDL_IMPL(int32x4_t, int16x4_t, vaddl, s16)
+VADDL_IMPL(uint64x2_t, uint32x2_t, vaddl, u32)
+VADDL_IMPL(int64x2_t, int32x2_t, vaddl, s32)
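+// Illustrative note: vaddw/vaddl produce double-width lanes, so the sum of two
+// uint8x8_t vectors is exact, e.g.
+//   uint16x8_t acc = vaddl(vdup_n_u8(250), vdup_n_u8(250)); // 500 in every lane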
+#undef VADDL_IMPL
+
+#if defined(__aarch64__)
+// VADDV: Across vector add
+#define VADDV_IMPL(stype, vtype, prefix, postfix) \
+    inline stype vaddv(const vtype &a)            \
+    {                                             \
+        return prefix##_##postfix(a);             \
+    }
+
+VADDV_IMPL(uint8_t, uint8x8_t, vaddv, u8)
+VADDV_IMPL(int8_t, int8x8_t, vaddv, s8)
+VADDV_IMPL(uint16_t, uint16x4_t, vaddv, u16)
+VADDV_IMPL(int16_t, int16x4_t, vaddv, s16)
+VADDV_IMPL(uint32_t, uint32x2_t, vaddv, u32)
+VADDV_IMPL(int32_t, int32x2_t, vaddv, s32)
+VADDV_IMPL(float, float32x2_t, vaddv, f32)
+
+VADDV_IMPL(uint8_t, uint8x16_t, vaddvq, u8)
+VADDV_IMPL(int8_t, int8x16_t, vaddvq, s8)
+VADDV_IMPL(uint16_t, uint16x8_t, vaddvq, u16)
+VADDV_IMPL(int16_t, int16x8_t, vaddvq, s16)
+VADDV_IMPL(uint32_t, uint32x4_t, vaddvq, u32)
+VADDV_IMPL(int32_t, int32x4_t, vaddvq, s32)
+VADDV_IMPL(uint64_t, uint64x2_t, vaddvq, u64)
+VADDV_IMPL(int64_t, int64x2_t, vaddvq, s64)
+VADDV_IMPL(float, float32x4_t, vaddvq, f32)
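+// Illustrative usage (hypothetical values): horizontal reduction of all lanes:
+//   float sum = vaddv(vdupq_n_f32(1.5f)); // 4 lanes of 1.5f -> 6.0f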
+#undef VADDV_IMPL
+#endif // defined(__aarch64__)
+
+// VPADDL: Pairwise add long (widening), for both signed and unsigned types
+#define VPADDL_IMPL(ltype, vtype, prefix, postfix) \
+    inline ltype vpaddl(const vtype &a)            \
+    {                                              \
+        return prefix##_##postfix(a);              \
+    }
+
+VPADDL_IMPL(uint16x4_t, uint8x8_t, vpaddl, u8)
+VPADDL_IMPL(int16x4_t, int8x8_t, vpaddl, s8)
+VPADDL_IMPL(uint32x2_t, uint16x4_t, vpaddl, u16)
+VPADDL_IMPL(int32x2_t, int16x4_t, vpaddl, s16)
+VPADDL_IMPL(uint64x1_t, uint32x2_t, vpaddl, u32)
+VPADDL_IMPL(int64x1_t, int32x2_t, vpaddl, s32)
+
+VPADDL_IMPL(uint16x8_t, uint8x16_t, vpaddlq, u8)
+VPADDL_IMPL(int16x8_t, int8x16_t, vpaddlq, s8)
+VPADDL_IMPL(uint32x4_t, uint16x8_t, vpaddlq, u16)
+VPADDL_IMPL(int32x4_t, int16x8_t, vpaddlq, s16)
+VPADDL_IMPL(uint64x2_t, uint32x4_t, vpaddlq, u32)
+VPADDL_IMPL(int64x2_t, int32x4_t, vpaddlq, s32)
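+// Illustrative usage: vpaddl sums adjacent pairs into wider lanes, e.g.
+//   uint16x8_t p = vpaddl(vdupq_n_u8(255)); // each 16-bit lane = 255 + 255 = 510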
+#undef VPADDL_IMPL
+
+// VPADD: Add pairwise
+#define VPADD_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vpadd(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VPADD_IMPL(uint8x8_t, uint8x8_t, vpadd, u8)
+VPADD_IMPL(int8x8_t, int8x8_t, vpadd, s8)
+VPADD_IMPL(uint16x4_t, uint16x4_t, vpadd, u16)
+VPADD_IMPL(int16x4_t, int16x4_t, vpadd, s16)
+VPADD_IMPL(uint32x2_t, uint32x2_t, vpadd, u32)
+VPADD_IMPL(int32x2_t, int32x2_t, vpadd, s32)
+VPADD_IMPL(float32x2_t, float32x2_t, vpadd, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VPADD_IMPL(float16x4_t, float16x4_t, vpadd, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VPADD_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_ADD_H */
diff --git a/src/core/NEON/wrapper/intrinsics/and.h b/src/core/NEON/wrapper/intrinsics/and.h
new file mode 100644
index 0000000..6ff7df3
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/and.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_AND_H
+#define ARM_COMPUTE_WRAPPER_AND_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VAND_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vand(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VAND_IMPL(uint8_t, uint8x8_t, vand, u8)
+VAND_IMPL(int8_t, int8x8_t, vand, s8)
+VAND_IMPL(uint16_t, uint16x4_t, vand, u16)
+VAND_IMPL(int16_t, int16x4_t, vand, s16)
+VAND_IMPL(uint32_t, uint32x2_t, vand, u32)
+VAND_IMPL(int32_t, int32x2_t, vand, s32)
+VAND_IMPL(uint64_t, uint64x1_t, vand, u64)
+VAND_IMPL(int64_t, int64x1_t, vand, s64)
+
+VAND_IMPL(uint8_t, uint8x16_t, vandq, u8)
+VAND_IMPL(int8_t, int8x16_t, vandq, s8)
+VAND_IMPL(uint16_t, uint16x8_t, vandq, u16)
+VAND_IMPL(int16_t, int16x8_t, vandq, s16)
+VAND_IMPL(uint32_t, uint32x4_t, vandq, u32)
+VAND_IMPL(int32_t, int32x4_t, vandq, s32)
+VAND_IMPL(uint64_t, uint64x2_t, vandq, u64)
+VAND_IMPL(int64_t, int64x2_t, vandq, s64)
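+
+// Illustrative usage (hypothetical a, b, values): apply a comparison mask:
+//   uint8x8_t mask = vceq_u8(a, b);      // all ones where equal
+//   uint8x8_t kept = vand(values, mask); // zero out non-matching lanes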
+
+#undef VAND_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_AND_H */
diff --git a/src/core/NEON/wrapper/intrinsics/bsl.h b/src/core/NEON/wrapper/intrinsics/bsl.h
new file mode 100644
index 0000000..01c1cce
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/bsl.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_BSL_H
+#define ARM_COMPUTE_WRAPPER_BSL_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VBSL_IMPL(stype, vtype, ctype, prefix, postfix)               \
+    inline vtype vbsl(const ctype &a, const vtype &b, const vtype &c) \
+    {                                                                 \
+        return prefix##_##postfix(a, b, c);                           \
+    }
+
+VBSL_IMPL(uint8_t, uint8x8_t, uint8x8_t, vbsl, u8)
+VBSL_IMPL(int8_t, int8x8_t, uint8x8_t, vbsl, s8)
+VBSL_IMPL(uint16_t, uint16x4_t, uint16x4_t, vbsl, u16)
+VBSL_IMPL(int16_t, int16x4_t, uint16x4_t, vbsl, s16)
+VBSL_IMPL(uint32_t, uint32x2_t, uint32x2_t, vbsl, u32)
+VBSL_IMPL(int32_t, int32x2_t, uint32x2_t, vbsl, s32)
+VBSL_IMPL(float32x2_t, float32x2_t, uint32x2_t, vbsl, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VBSL_IMPL(float16x4_t, float16x4_t, uint16x4_t, vbsl, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VBSL_IMPL(uint8_t, uint8x16_t, uint8x16_t, vbslq, u8)
+VBSL_IMPL(int8_t, int8x16_t, uint8x16_t, vbslq, s8)
+VBSL_IMPL(uint16_t, uint16x8_t, uint16x8_t, vbslq, u16)
+VBSL_IMPL(int16_t, int16x8_t, uint16x8_t, vbslq, s16)
+VBSL_IMPL(uint32_t, uint32x4_t, uint32x4_t, vbslq, u32)
+VBSL_IMPL(int32_t, int32x4_t, uint32x4_t, vbslq, s32)
+VBSL_IMPL(float32x4_t, float32x4_t, uint32x4_t, vbslq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VBSL_IMPL(float16x8_t, float16x8_t, uint16x8_t, vbslq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
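+
+// Illustrative usage (hypothetical mask/when_set/when_clear): per-bit select,
+// taking bits from the second argument where the mask bit is set:
+//   float32x4_t r = vbsl(mask, when_set, when_clear);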
+
+#undef VBSL_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_BSL_H */
diff --git a/src/core/NEON/wrapper/intrinsics/ceq.h b/src/core/NEON/wrapper/intrinsics/ceq.h
new file mode 100644
index 0000000..b0324e6
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/ceq.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CEQ_H
+#define ARM_COMPUTE_WRAPPER_CEQ_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCEQ_IMPL(votype, vtype, prefix, postfix)      \
+    inline votype vceq(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VCEQ_IMPL(uint8x8_t, uint8x8_t, vceq, u8)
+VCEQ_IMPL(uint8x8_t, int8x8_t, vceq, s8)
+VCEQ_IMPL(uint16x4_t, uint16x4_t, vceq, u16)
+VCEQ_IMPL(uint16x4_t, int16x4_t, vceq, s16)
+VCEQ_IMPL(uint32x2_t, uint32x2_t, vceq, u32)
+VCEQ_IMPL(uint32x2_t, int32x2_t, vceq, s32)
+VCEQ_IMPL(uint32x2_t, float32x2_t, vceq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCEQ_IMPL(uint16x4_t, float16x4_t, vceq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCEQ_IMPL(uint8x16_t, uint8x16_t, vceqq, u8)
+VCEQ_IMPL(uint8x16_t, int8x16_t, vceqq, s8)
+VCEQ_IMPL(uint16x8_t, uint16x8_t, vceqq, u16)
+VCEQ_IMPL(uint16x8_t, int16x8_t, vceqq, s16)
+VCEQ_IMPL(uint32x4_t, uint32x4_t, vceqq, u32)
+VCEQ_IMPL(uint32x4_t, int32x4_t, vceqq, s32)
+VCEQ_IMPL(uint32x4_t, float32x4_t, vceqq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCEQ_IMPL(uint16x8_t, float16x8_t, vceqq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
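+
+// Illustrative note: each result lane is all ones where a == b and all zeros
+// otherwise, e.g. vceq(vdup_n_u8(3), vdup_n_u8(3)) yields 0xFF in every lane.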
+
+#undef VCEQ_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CEQ_H */
diff --git a/src/core/NEON/wrapper/intrinsics/cge.h b/src/core/NEON/wrapper/intrinsics/cge.h
new file mode 100644
index 0000000..e4a7fcd
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/cge.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CGE_H
+#define ARM_COMPUTE_WRAPPER_CGE_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCGE_IMPL(stype, vtype, rtype, prefix, postfix) \
+    inline rtype vcge(const vtype &a, const vtype &b)   \
+    {                                                   \
+        return prefix##_##postfix(a, b);                \
+    }
+
+VCGE_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcge, u8)
+VCGE_IMPL(int8_t, int8x8_t, uint8x8_t, vcge, s8)
+VCGE_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcge, u16)
+VCGE_IMPL(int16_t, int16x4_t, uint16x4_t, vcge, s16)
+VCGE_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcge, u32)
+VCGE_IMPL(int32_t, int32x2_t, uint32x2_t, vcge, s32)
+VCGE_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcge, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCGE_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcge, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCGE_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcgeq, u8)
+VCGE_IMPL(int8_t, int8x16_t, uint8x16_t, vcgeq, s8)
+VCGE_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcgeq, u16)
+VCGE_IMPL(int16_t, int16x8_t, uint16x8_t, vcgeq, s16)
+VCGE_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcgeq, u32)
+VCGE_IMPL(int32_t, int32x4_t, uint32x4_t, vcgeq, s32)
+VCGE_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcgeq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCGE_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcgeq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCGE_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CGE_H */
diff --git a/src/core/NEON/wrapper/intrinsics/cgt.h b/src/core/NEON/wrapper/intrinsics/cgt.h
new file mode 100644
index 0000000..f34d02f
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/cgt.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CGT_H
+#define ARM_COMPUTE_WRAPPER_CGT_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCGT_IMPL(rtype, vtype, prefix, postfix)      \
+    inline rtype vcgt(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VCGT_IMPL(uint8x8_t, uint8x8_t, vcgt, u8)
+VCGT_IMPL(uint8x8_t, int8x8_t, vcgt, s8)
+VCGT_IMPL(uint16x4_t, uint16x4_t, vcgt, u16)
+VCGT_IMPL(uint16x4_t, int16x4_t, vcgt, s16)
+VCGT_IMPL(uint32x2_t, uint32x2_t, vcgt, u32)
+VCGT_IMPL(uint32x2_t, int32x2_t, vcgt, s32)
+VCGT_IMPL(uint32x2_t, float32x2_t, vcgt, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCGT_IMPL(uint16x4_t, float16x4_t, vcgt, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCGT_IMPL(uint8x16_t, uint8x16_t, vcgtq, u8)
+VCGT_IMPL(uint8x16_t, int8x16_t, vcgtq, s8)
+VCGT_IMPL(uint16x8_t, uint16x8_t, vcgtq, u16)
+VCGT_IMPL(uint16x8_t, int16x8_t, vcgtq, s16)
+VCGT_IMPL(uint32x4_t, uint32x4_t, vcgtq, u32)
+VCGT_IMPL(uint32x4_t, int32x4_t, vcgtq, s32)
+VCGT_IMPL(uint32x4_t, float32x4_t, vcgtq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCGT_IMPL(uint16x8_t, float16x8_t, vcgtq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
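+
+// Illustrative usage (hypothetical a, b): a lane-wise max built with vbsl:
+//   float32x4_t vmax = vbsl(vcgt(a, b), a, b); // a where a > b, else b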
+
+#undef VCGT_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CGT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/cle.h b/src/core/NEON/wrapper/intrinsics/cle.h
new file mode 100644
index 0000000..50c175f
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/cle.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CLE_H
+#define ARM_COMPUTE_WRAPPER_CLE_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCLE_IMPL(stype, vtype, rtype, prefix, postfix) \
+    inline rtype vcle(const vtype &a, const vtype &b)   \
+    {                                                   \
+        return prefix##_##postfix(a, b);                \
+    }
+
+VCLE_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcle, u8)
+VCLE_IMPL(int8_t, int8x8_t, uint8x8_t, vcle, s8)
+VCLE_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcle, u16)
+VCLE_IMPL(int16_t, int16x4_t, uint16x4_t, vcle, s16)
+VCLE_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcle, u32)
+VCLE_IMPL(int32_t, int32x2_t, uint32x2_t, vcle, s32)
+VCLE_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcle, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLE_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcle, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCLE_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcleq, u8)
+VCLE_IMPL(int8_t, int8x16_t, uint8x16_t, vcleq, s8)
+VCLE_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcleq, u16)
+VCLE_IMPL(int16_t, int16x8_t, uint16x8_t, vcleq, s16)
+VCLE_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcleq, u32)
+VCLE_IMPL(int32_t, int32x4_t, uint32x4_t, vcleq, s32)
+VCLE_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcleq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLE_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcleq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCLE_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CLE_H */
diff --git a/src/core/NEON/wrapper/intrinsics/clt.h b/src/core/NEON/wrapper/intrinsics/clt.h
new file mode 100644
index 0000000..10fd320
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/clt.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CLT_H
+#define ARM_COMPUTE_WRAPPER_CLT_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCLT_IMPL(votype, vtype, prefix, postfix)      \
+    inline votype vclt(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VCLT_IMPL(uint8x8_t, uint8x8_t, vclt, u8)
+VCLT_IMPL(uint8x8_t, int8x8_t, vclt, s8)
+VCLT_IMPL(uint16x4_t, uint16x4_t, vclt, u16)
+VCLT_IMPL(uint16x4_t, int16x4_t, vclt, s16)
+VCLT_IMPL(uint32x2_t, uint32x2_t, vclt, u32)
+VCLT_IMPL(uint32x2_t, int32x2_t, vclt, s32)
+VCLT_IMPL(uint32x2_t, float32x2_t, vclt, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLT_IMPL(uint16x4_t, float16x4_t, vclt, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCLT_IMPL(uint8x16_t, uint8x16_t, vcltq, u8)
+VCLT_IMPL(uint8x16_t, int8x16_t, vcltq, s8)
+VCLT_IMPL(uint16x8_t, uint16x8_t, vcltq, u16)
+VCLT_IMPL(uint16x8_t, int16x8_t, vcltq, s16)
+VCLT_IMPL(uint32x4_t, uint32x4_t, vcltq, u32)
+VCLT_IMPL(uint32x4_t, int32x4_t, vcltq, s32)
+VCLT_IMPL(uint32x4_t, float32x4_t, vcltq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLT_IMPL(uint16x8_t, float16x8_t, vcltq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCLT_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CLT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/combine.h b/src/core/NEON/wrapper/intrinsics/combine.h
new file mode 100644
index 0000000..8b6a588
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/combine.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_COMBINE_H
+#define ARM_COMPUTE_WRAPPER_COMBINE_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCOMBINE_IMPL(rtype, vtype, prefix, postfix)      \
+    inline rtype vcombine(const vtype &a, const vtype &b) \
+    {                                                     \
+        return prefix##_##postfix(a, b);                  \
+    }
+
+VCOMBINE_IMPL(uint8x16_t, uint8x8_t, vcombine, u8)
+VCOMBINE_IMPL(int8x16_t, int8x8_t, vcombine, s8)
+VCOMBINE_IMPL(uint16x8_t, uint16x4_t, vcombine, u16)
+VCOMBINE_IMPL(int16x8_t, int16x4_t, vcombine, s16)
+VCOMBINE_IMPL(uint32x4_t, uint32x2_t, vcombine, u32)
+VCOMBINE_IMPL(int32x4_t, int32x2_t, vcombine, s32)
+VCOMBINE_IMPL(float32x4_t, float32x2_t, vcombine, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCOMBINE_IMPL(float16x8_t, float16x4_t, vcombine, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
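+
+// Illustrative usage: rebuild a 128-bit vector from its two halves, e.g.
+//   uint8x16_t v = vcombine(vget_low_u8(x), vget_high_u8(x)); // v == x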
+
+#undef VCOMBINE_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_COMBINE_H */
diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h
new file mode 100644
index 0000000..6e79a92
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/cvt.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CVT_H
+#define ARM_COMPUTE_WRAPPER_CVT_H
+
+#include <arm_neon.h>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2)                   \
+    template <typename T>                                                            \
+    inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type \
+    vcvt(const vtype &a)                                                             \
+    {                                                                                \
+        return prefix##_##postfix1##_##postfix2(a);                                  \
+    }
+
+VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32)
+VCVT_TO_F32_IMPL(float32x4_t, int32x4_t, vcvtq, f32, s32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#undef VCVT_TO_F32_IMPL
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2)                       \
+    template <typename T>                                                                \
+    inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type \
+    vcvt(const vtype &a)                                                                 \
+    {                                                                                    \
+        return prefix##_##postfix1##_##postfix2(a);                                      \
+    }
+
+VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32)
+#undef VCVT_TO_F16_IMPL
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint32x4_t>::type
+vcvt(const float32x4_t &a)
+{
+    return vcvtq_u32_f32(a);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int32x4_t>::type
+vcvt(const float32x4_t &a)
+{
+    return vcvtq_s32_f32(a);
+}
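+
+// Illustrative note: the template parameter selects the destination element type,
+// e.g. vcvt<float>(s32_vec) yields float32x4_t. The uint8_t/int8_t selectors
+// return 32-bit integer vectors; narrowing to 8 bits is left to the caller.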
+
+#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
+/** Convert two 128-bit single-precision vectors (eight floats in total) into one 128-bit bfloat16 vector
+ *
+ * @param[in]  inptr  Pointer to the input memory to load values from
+ * @param[out] outptr Pointer to the output memory to store values to
+ */
+inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr)
+{
+    __asm __volatile(
+        "ldp    q0, q1, [%[inptr]]\n"
+        ".inst  0xea16800\n"  // BFCVTN v0, v0
+        ".inst  0x4ea16820\n" // BFCVTN2 v0, v1
+        "str    q0, [%[outptr]]\n"
+        : [inptr] "+r"(inptr)
+        : [outptr] "r"(outptr)
+        : "v0", "v1", "memory");
+}
+#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CVT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/div.h b/src/core/NEON/wrapper/intrinsics/div.h
new file mode 100644
index 0000000..265f30d
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/div.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_DIV_H
+#define ARM_COMPUTE_WRAPPER_DIV_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#ifdef __aarch64__
+
+#define VDIV_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vdiv(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+VDIV_IMPL(float32x2_t, float32x2_t, vdiv, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDIV_IMPL(float16x4_t, float16x4_t, vdiv, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VDIV_IMPL(float32x4_t, float32x4_t, vdivq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDIV_IMPL(float16x8_t, float16x8_t, vdivq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#else // __aarch64__
+
+#define VDIV_IMPL(stype, vtype, mul_prefix, inv_prefix, postfix)     \
+    inline vtype vdiv(const vtype &a, const vtype &b)                \
+    {                                                                \
+        return mul_prefix##_##postfix(a, inv_prefix##_##postfix(b)); \
+    }
+VDIV_IMPL(float32x2_t, float32x2_t, vmul, vinv, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDIV_IMPL(float16x4_t, float16x4_t, vmul, vinv, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VDIV_IMPL(float32x4_t, float32x4_t, vmulq, vinvq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDIV_IMPL(float16x8_t, float16x8_t, vmulq, vinvq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#endif // __aarch64__
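+
+// Note: armv7 has no vector divide instruction, so the non-AArch64 path above
+// composes a * (1 / b) using the approximate reciprocals vinv/vinvq from NEMath.h
+// (refined reciprocal estimates) rather than an exact division.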
+
+#undef VDIV_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_DIV_H */
diff --git a/src/core/NEON/wrapper/intrinsics/dup_n.h b/src/core/NEON/wrapper/intrinsics/dup_n.h
new file mode 100644
index 0000000..e745aa4
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/dup_n.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_DUP_N_H
+#define ARM_COMPUTE_WRAPPER_DUP_N_H
+
+#include "src/core/NEON/wrapper/traits.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VDUP_N_IMPL(stype, vtype, prefix, postfix, tag) \
+    inline vtype vdup_n(stype value, tag)               \
+    {                                                   \
+        return prefix##_##postfix(value);               \
+    }
+
+VDUP_N_IMPL(uint8_t, uint8x8_t, vdup_n, u8, traits::vector_64_tag)
+VDUP_N_IMPL(int8_t, int8x8_t, vdup_n, s8, traits::vector_64_tag)
+VDUP_N_IMPL(uint16_t, uint16x4_t, vdup_n, u16, traits::vector_64_tag)
+VDUP_N_IMPL(int16_t, int16x4_t, vdup_n, s16, traits::vector_64_tag)
+VDUP_N_IMPL(uint32_t, uint32x2_t, vdup_n, u32, traits::vector_64_tag)
+VDUP_N_IMPL(int32_t, int32x2_t, vdup_n, s32, traits::vector_64_tag)
+VDUP_N_IMPL(float, float32x2_t, vdup_n, f32, traits::vector_64_tag)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDUP_N_IMPL(float16_t, float16x4_t, vdup_n, f16, traits::vector_64_tag)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VDUP_N_IMPL(uint8_t, uint8x16_t, vdupq_n, u8, traits::vector_128_tag)
+VDUP_N_IMPL(int8_t, int8x16_t, vdupq_n, s8, traits::vector_128_tag)
+VDUP_N_IMPL(uint16_t, uint16x8_t, vdupq_n, u16, traits::vector_128_tag)
+VDUP_N_IMPL(int16_t, int16x8_t, vdupq_n, s16, traits::vector_128_tag)
+VDUP_N_IMPL(uint32_t, uint32x4_t, vdupq_n, u32, traits::vector_128_tag)
+VDUP_N_IMPL(int32_t, int32x4_t, vdupq_n, s32, traits::vector_128_tag)
+VDUP_N_IMPL(float, float32x4_t, vdupq_n, f32, traits::vector_128_tag)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDUP_N_IMPL(float16_t, float16x8_t, vdupq_n, f16, traits::vector_128_tag)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
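+// Illustrative usage: the tag argument selects the vector width at compile time:
+//   auto v64  = vdup_n(uint8_t(42), traits::vector_64_tag{});  // uint8x8_t
+//   auto v128 = vdup_n(uint8_t(42), traits::vector_128_tag{}); // uint8x16_t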
+#undef VDUP_N_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_DUP_N_H */
diff --git a/src/core/NEON/wrapper/intrinsics/eor.h b/src/core/NEON/wrapper/intrinsics/eor.h
new file mode 100644
index 0000000..ce88cf5
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/eor.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_EOR_H
+#define ARM_COMPUTE_WRAPPER_EOR_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VEOR_IMPL(vtype, prefix, postfix)             \
+    inline vtype veor(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VEOR_IMPL(uint8x8_t, veor, u8)
+VEOR_IMPL(int8x8_t, veor, s8)
+VEOR_IMPL(uint16x4_t, veor, u16)
+VEOR_IMPL(int16x4_t, veor, s16)
+VEOR_IMPL(uint32x2_t, veor, u32)
+VEOR_IMPL(int32x2_t, veor, s32)
+
+VEOR_IMPL(uint8x16_t, veorq, u8)
+VEOR_IMPL(int8x16_t, veorq, s8)
+VEOR_IMPL(uint16x8_t, veorq, u16)
+VEOR_IMPL(int16x8_t, veorq, s16)
+VEOR_IMPL(uint32x4_t, veorq, u32)
+VEOR_IMPL(int32x4_t, veorq, s32)
+
+#undef VEOR_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_EOR_H */
diff --git a/src/core/NEON/wrapper/intrinsics/exp.h b/src/core/NEON/wrapper/intrinsics/exp.h
new file mode 100644
index 0000000..c2a6970
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/exp.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_EXP_H
+#define ARM_COMPUTE_WRAPPER_EXP_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VEXPQ_IMPL(vtype, postfix)     \
+    inline vtype vexpq(const vtype &a) \
+    {                                  \
+        return vexpq_##postfix(a);     \
+    }
+
+#define VEXPQ_IMPL_INT(vtype, postfix)      \
+    inline vtype vexpq(const vtype &a)      \
+    {                                       \
+        ARM_COMPUTE_UNUSED(a);              \
+        ARM_COMPUTE_ERROR("Not supported"); \
+    }
+
+VEXPQ_IMPL(float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VEXPQ_IMPL(float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VEXPQ_IMPL_INT(int32x4_t, s32)
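+// Note: the integer overload exists only so that generic template code compiles;
+// calling it raises a runtime error, as there is no integer vector exponential.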
+#undef VEXPQ_IMPL
+#undef VEXPQ_IMPL_INT
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_EXP_H */
diff --git a/src/core/NEON/wrapper/intrinsics/ext.h b/src/core/NEON/wrapper/intrinsics/ext.h
new file mode 100644
index 0000000..d44b231
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/ext.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_EXT_H
+#define ARM_COMPUTE_WRAPPER_EXT_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VEXT_IMPL(vtype, prefix, postfix, size)            \
+    inline vtype vext_##size(vtype value_a, vtype value_b) \
+    {                                                      \
+        return prefix##_##postfix(value_a, value_b, size); \
+    }
+
+VEXT_IMPL(uint8x8_t, vext, u8, 1)
+VEXT_IMPL(uint8x8_t, vext, u8, 2)
+VEXT_IMPL(int8x8_t, vext, s8, 1)
+VEXT_IMPL(int8x8_t, vext, s8, 2)
+VEXT_IMPL(uint16x4_t, vext, u16, 1)
+VEXT_IMPL(uint16x4_t, vext, u16, 2)
+VEXT_IMPL(int16x4_t, vext, s16, 1)
+VEXT_IMPL(int16x4_t, vext, s16, 2)
+
+VEXT_IMPL(uint8x16_t, vextq, u8, 1)
+VEXT_IMPL(uint8x16_t, vextq, u8, 2)
+VEXT_IMPL(int8x16_t, vextq, s8, 1)
+VEXT_IMPL(int8x16_t, vextq, s8, 2)
+VEXT_IMPL(uint16x8_t, vextq, u16, 1)
+VEXT_IMPL(uint16x8_t, vextq, u16, 2)
+VEXT_IMPL(int16x8_t, vextq, s16, 1)
+VEXT_IMPL(int16x8_t, vextq, s16, 2)
+VEXT_IMPL(int32x4_t, vextq, s32, 1)
+VEXT_IMPL(int32x4_t, vextq, s32, 2)
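+
+// Illustrative usage (hypothetical values): vext_<N> extracts a vector starting
+// N lanes into the concatenation of a and b, e.g. for uint8x8_t a = {0..7},
+// b = {8..15}: vext_1(a, b) == {1,2,3,4,5,6,7,8}.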
+
+#undef VEXT_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_EXT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/gethigh.h b/src/core/NEON/wrapper/intrinsics/gethigh.h
new file mode 100644
index 0000000..d098a27
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/gethigh.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_GET_HIGH_H
+#define ARM_COMPUTE_WRAPPER_GET_HIGH_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VGETHIGH_IMPL(half_vtype, vtype, postfix) \
+    inline half_vtype vgethigh(const vtype val)   \
+    {                                             \
+        return vget_high_##postfix(val);          \
+    }
+
+VGETHIGH_IMPL(uint8x8_t, uint8x16_t, u8)
+VGETHIGH_IMPL(int8x8_t, int8x16_t, s8)
+VGETHIGH_IMPL(uint16x4_t, uint16x8_t, u16)
+VGETHIGH_IMPL(int16x4_t, int16x8_t, s16)
+VGETHIGH_IMPL(uint32x2_t, uint32x4_t, u32)
+VGETHIGH_IMPL(int32x2_t, int32x4_t, s32)
+VGETHIGH_IMPL(float32x2_t, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETHIGH_IMPL(float16x4_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VGETHIGH_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_GET_HIGH_H */
diff --git a/src/core/NEON/wrapper/intrinsics/getlane.h b/src/core/NEON/wrapper/intrinsics/getlane.h
new file mode 100644
index 0000000..2052751
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/getlane.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_GET_LANE_H
+#define ARM_COMPUTE_WRAPPER_GET_LANE_H
+
+#include "arm_compute/core/Error.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
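+// The NEON vget_lane_* intrinsics require an immediate (compile-time) lane index,
+// so these wrappers dispatch a runtime lane through a switch over the valid lanes.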
+#define VGETLANE_IMPL_8(stype, vtype, postfix)                         \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    {                                                                  \
+        switch(lane)                                                   \
+        {                                                              \
+            case 0:                                                    \
+                return vget_lane_##postfix(vector, 0);                 \
+            case 1:                                                    \
+                return vget_lane_##postfix(vector, 1);                 \
+            case 2:                                                    \
+                return vget_lane_##postfix(vector, 2);                 \
+            case 3:                                                    \
+                return vget_lane_##postfix(vector, 3);                 \
+            case 4:                                                    \
+                return vget_lane_##postfix(vector, 4);                 \
+            case 5:                                                    \
+                return vget_lane_##postfix(vector, 5);                 \
+            case 6:                                                    \
+                return vget_lane_##postfix(vector, 6);                 \
+            case 7:                                                    \
+                return vget_lane_##postfix(vector, 7);                 \
+            default:                                                   \
+                ARM_COMPUTE_ERROR("Invalid lane");                     \
+        }                                                              \
+    }
+
+#define VGETLANE_IMPL_4(stype, vtype, postfix)                         \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    {                                                                  \
+        switch(lane)                                                   \
+        {                                                              \
+            case 0:                                                    \
+                return vget_lane_##postfix(vector, 0);                 \
+            case 1:                                                    \
+                return vget_lane_##postfix(vector, 1);                 \
+            case 2:                                                    \
+                return vget_lane_##postfix(vector, 2);                 \
+            case 3:                                                    \
+                return vget_lane_##postfix(vector, 3);                 \
+            default:                                                   \
+                ARM_COMPUTE_ERROR("Invalid lane");                     \
+        }                                                              \
+    }
+
+#define VGETLANE_IMPL_2(stype, vtype, postfix)                         \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    {                                                                  \
+        switch(lane)                                                   \
+        {                                                              \
+            case 0:                                                    \
+                return vget_lane_##postfix(vector, 0);                 \
+            case 1:                                                    \
+                return vget_lane_##postfix(vector, 1);                 \
+            default:                                                   \
+                ARM_COMPUTE_ERROR("Invalid lane");                     \
+        }                                                              \
+    }
+
+VGETLANE_IMPL_8(uint8_t, uint8x8_t, u8)
+VGETLANE_IMPL_8(int8_t, int8x8_t, s8)
+VGETLANE_IMPL_4(uint16_t, uint16x4_t, u16)
+VGETLANE_IMPL_4(int16_t, int16x4_t, s16)
+VGETLANE_IMPL_2(uint32_t, uint32x2_t, u32)
+VGETLANE_IMPL_2(int32_t, int32x2_t, s32)
+VGETLANE_IMPL_2(float, float32x2_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#define VGETQLANE_IMPL_16(stype, vtype, postfix)                       \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    {                                                                  \
+        switch(lane)                                                   \
+        {                                                              \
+            case 0:                                                    \
+                return vgetq_lane_##postfix(vector, 0);                \
+            case 1:                                                    \
+                return vgetq_lane_##postfix(vector, 1);                \
+            case 2:                                                    \
+                return vgetq_lane_##postfix(vector, 2);                \
+            case 3:                                                    \
+                return vgetq_lane_##postfix(vector, 3);                \
+            case 4:                                                    \
+                return vgetq_lane_##postfix(vector, 4);                \
+            case 5:                                                    \
+                return vgetq_lane_##postfix(vector, 5);                \
+            case 6:                                                    \
+                return vgetq_lane_##postfix(vector, 6);                \
+            case 7:                                                    \
+                return vgetq_lane_##postfix(vector, 7);                \
+            case 8:                                                    \
+                return vgetq_lane_##postfix(vector, 8);                \
+            case 9:                                                    \
+                return vgetq_lane_##postfix(vector, 9);                \
+            case 10:                                                   \
+                return vgetq_lane_##postfix(vector, 10);               \
+            case 11:                                                   \
+                return vgetq_lane_##postfix(vector, 11);               \
+            case 12:                                                   \
+                return vgetq_lane_##postfix(vector, 12);               \
+            case 13:                                                   \
+                return vgetq_lane_##postfix(vector, 13);               \
+            case 14:                                                   \
+                return vgetq_lane_##postfix(vector, 14);               \
+            case 15:                                                   \
+                return vgetq_lane_##postfix(vector, 15);               \
+            default:                                                   \
+                ARM_COMPUTE_ERROR("Invalid lane");                     \
+        }                                                              \
+    }
+
+#define VGETQLANE_IMPL_8(stype, vtype, postfix)                        \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    {                                                                  \
+        switch(lane)                                                   \
+        {                                                              \
+            case 0:                                                    \
+                return vgetq_lane_##postfix(vector, 0);                \
+            case 1:                                                    \
+                return vgetq_lane_##postfix(vector, 1);                \
+            case 2:                                                    \
+                return vgetq_lane_##postfix(vector, 2);                \
+            case 3:                                                    \
+                return vgetq_lane_##postfix(vector, 3);                \
+            case 4:                                                    \
+                return vgetq_lane_##postfix(vector, 4);                \
+            case 5:                                                    \
+                return vgetq_lane_##postfix(vector, 5);                \
+            case 6:                                                    \
+                return vgetq_lane_##postfix(vector, 6);                \
+            case 7:                                                    \
+                return vgetq_lane_##postfix(vector, 7);                \
+            default:                                                   \
+                ARM_COMPUTE_ERROR("Invalid lane");                     \
+        }                                                              \
+    }
+
+#define VGETQLANE_IMPL_4(stype, vtype, postfix)                        \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    {                                                                  \
+        switch(lane)                                                   \
+        {                                                              \
+            case 0:                                                    \
+                return vgetq_lane_##postfix(vector, 0);                \
+            case 1:                                                    \
+                return vgetq_lane_##postfix(vector, 1);                \
+            case 2:                                                    \
+                return vgetq_lane_##postfix(vector, 2);                \
+            case 3:                                                    \
+                return vgetq_lane_##postfix(vector, 3);                \
+            default:                                                   \
+                ARM_COMPUTE_ERROR("Invalid lane");                     \
+        }                                                              \
+    }
+
+#define VGETQLANE_IMPL_2(stype, vtype, postfix)                        \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    {                                                                  \
+        switch(lane)                                                   \
+        {                                                              \
+            case 0:                                                    \
+                return vgetq_lane_##postfix(vector, 0);                \
+            case 1:                                                    \
+                return vgetq_lane_##postfix(vector, 1);                \
+            default:                                                   \
+                ARM_COMPUTE_ERROR("Invalid lane");                     \
+        }                                                              \
+    }
+
+VGETQLANE_IMPL_16(uint8_t, uint8x16_t, u8)
+VGETQLANE_IMPL_16(int8_t, int8x16_t, s8)
+VGETQLANE_IMPL_8(uint16_t, uint16x8_t, u16)
+VGETQLANE_IMPL_8(int16_t, int16x8_t, s16)
+VGETQLANE_IMPL_4(uint32_t, uint32x4_t, u32)
+VGETQLANE_IMPL_4(int32_t, int32x4_t, s32)
+VGETQLANE_IMPL_4(float, float32x4_t, f32)
+VGETQLANE_IMPL_2(int64_t, int64x2_t, s64)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETQLANE_IMPL_8(float16_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VGETLANE_IMPL_8
+#undef VGETLANE_IMPL_4
+#undef VGETLANE_IMPL_2
+
+#undef VGETQLANE_IMPL_16
+#undef VGETQLANE_IMPL_8
+#undef VGETQLANE_IMPL_4
+#undef VGETQLANE_IMPL_2
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_GET_LANE_H */
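Usage sketch (illustrative, not part of the patch): the raw vgetq_lane_f32 intrinsic requires a compile-time constant lane index, so a kernel with a runtime index goes through the wrapper instead. `lane_at` is a hypothetical helper name.

    #include "src/core/NEON/wrapper/intrinsics/getlane.h"

    // wrapper::vgetlane trades the switch statement above for a runtime index.
    inline float lane_at(const float32x4_t v, const unsigned int i)
    {
        return arm_compute::wrapper::vgetlane(v, i);
    }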
diff --git a/src/core/NEON/wrapper/intrinsics/getlow.h b/src/core/NEON/wrapper/intrinsics/getlow.h
new file mode 100644
index 0000000..b5469f0
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/getlow.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_GET_LOW_H
+#define ARM_COMPUTE_WRAPPER_GET_LOW_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VGETLOW_IMPL(half_vtype, vtype, postfix) \
+    inline half_vtype vgetlow(const vtype val)   \
+    {                                            \
+        return vget_low_##postfix(val);          \
+    }
+
+VGETLOW_IMPL(uint8x8_t, uint8x16_t, u8)
+VGETLOW_IMPL(int8x8_t, int8x16_t, s8)
+VGETLOW_IMPL(uint16x4_t, uint16x8_t, u16)
+VGETLOW_IMPL(int16x4_t, int16x8_t, s16)
+VGETLOW_IMPL(uint32x2_t, uint32x4_t, u32)
+VGETLOW_IMPL(int32x2_t, int32x4_t, s32)
+VGETLOW_IMPL(float32x2_t, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETLOW_IMPL(float16x4_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VGETLOW_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_GET_LOW_H */
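A sketch of the usual split-and-process pattern; `vgethigh` comes from the sibling gethigh.h header, and `sum_halves` is a hypothetical name.

    #include "src/core/NEON/wrapper/intrinsics/gethigh.h"
    #include "src/core/NEON/wrapper/intrinsics/getlow.h"

    // Split a q-register into two d-registers and combine them; a common
    // first step when reducing 128-bit vectors with d-register intrinsics.
    inline uint8x8_t sum_halves(const uint8x16_t v)
    {
        return vadd_u8(arm_compute::wrapper::vgetlow(v),
                       arm_compute::wrapper::vgethigh(v));
    }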
diff --git a/src/core/NEON/wrapper/intrinsics/intrinsics.h b/src/core/NEON/wrapper/intrinsics/intrinsics.h
new file mode 100644
index 0000000..495321a
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/intrinsics.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_INTRINSICS_H
+#define ARM_COMPUTE_WRAPPER_INTRINSICS_H
+
+#include "src/core/NEON/wrapper/intrinsics/abs.h"
+#include "src/core/NEON/wrapper/intrinsics/add.h"
+#include "src/core/NEON/wrapper/intrinsics/and.h"
+#include "src/core/NEON/wrapper/intrinsics/bsl.h"
+#include "src/core/NEON/wrapper/intrinsics/ceq.h"
+#include "src/core/NEON/wrapper/intrinsics/cge.h"
+#include "src/core/NEON/wrapper/intrinsics/cgt.h"
+#include "src/core/NEON/wrapper/intrinsics/cle.h"
+#include "src/core/NEON/wrapper/intrinsics/clt.h"
+#include "src/core/NEON/wrapper/intrinsics/combine.h"
+#include "src/core/NEON/wrapper/intrinsics/cvt.h"
+#include "src/core/NEON/wrapper/intrinsics/div.h"
+#include "src/core/NEON/wrapper/intrinsics/dup_n.h"
+#include "src/core/NEON/wrapper/intrinsics/eor.h"
+#include "src/core/NEON/wrapper/intrinsics/exp.h"
+#include "src/core/NEON/wrapper/intrinsics/ext.h"
+#include "src/core/NEON/wrapper/intrinsics/gethigh.h"
+#include "src/core/NEON/wrapper/intrinsics/getlane.h"
+#include "src/core/NEON/wrapper/intrinsics/getlow.h"
+#include "src/core/NEON/wrapper/intrinsics/inv.h"
+#include "src/core/NEON/wrapper/intrinsics/invsqrt.h"
+#include "src/core/NEON/wrapper/intrinsics/load.h"
+#include "src/core/NEON/wrapper/intrinsics/log.h"
+#include "src/core/NEON/wrapper/intrinsics/max.h"
+#include "src/core/NEON/wrapper/intrinsics/min.h"
+#include "src/core/NEON/wrapper/intrinsics/mla.h"
+#include "src/core/NEON/wrapper/intrinsics/movl.h"
+#include "src/core/NEON/wrapper/intrinsics/movn.h"
+#include "src/core/NEON/wrapper/intrinsics/mul.h"
+#include "src/core/NEON/wrapper/intrinsics/neg.h"
+#include "src/core/NEON/wrapper/intrinsics/not.h"
+#include "src/core/NEON/wrapper/intrinsics/orr.h"
+#include "src/core/NEON/wrapper/intrinsics/pmax.h"
+#include "src/core/NEON/wrapper/intrinsics/pmin.h"
+#include "src/core/NEON/wrapper/intrinsics/pow.h"
+#include "src/core/NEON/wrapper/intrinsics/qmov.h"
+#include "src/core/NEON/wrapper/intrinsics/qmovun.h"
+#include "src/core/NEON/wrapper/intrinsics/reinterpret.h"
+#include "src/core/NEON/wrapper/intrinsics/rev64.h"
+#include "src/core/NEON/wrapper/intrinsics/round.h"
+#include "src/core/NEON/wrapper/intrinsics/setlane.h"
+#include "src/core/NEON/wrapper/intrinsics/sin.h"
+#include "src/core/NEON/wrapper/intrinsics/store.h"
+#include "src/core/NEON/wrapper/intrinsics/sub.h"
+#include "src/core/NEON/wrapper/intrinsics/tanh.h"
+#include "src/core/NEON/wrapper/intrinsics/tbl.h"
+
+#endif /* ARM_COMPUTE_WRAPPER_INTRINSICS_H */
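The point of this umbrella header is that a kernel body can be written once as a template over the vector type and let overload resolution pick the right intrinsic. A minimal sketch, with `fused_scale_add` as a hypothetical name:

    #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"

    // One template body serves every element type that provides vmla:
    // u8/s8/u16/s16/u32/s32/f32 (and f16 when built with FP16 support).
    template <typename V>
    inline V fused_scale_add(const V &acc, const V &x, const V &scale)
    {
        return arm_compute::wrapper::vmla(acc, x, scale); // acc + x * scale
    }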
diff --git a/src/core/NEON/wrapper/intrinsics/inv.h b/src/core/NEON/wrapper/intrinsics/inv.h
new file mode 100644
index 0000000..de398b0
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/inv.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_INV_H
+#define ARM_COMPUTE_WRAPPER_INV_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VINV_IMPL(vtype, prefix, postfix) \
+    inline vtype vinv(const vtype &a)     \
+    {                                     \
+        return prefix##_##postfix(a);     \
+    }
+
+#define VINV_IMPL_INT(vtype, prefix, postfix) \
+    inline vtype vinv(const vtype &a)         \
+    {                                         \
+        ARM_COMPUTE_UNUSED(a);                \
+        ARM_COMPUTE_ERROR("Not supported");   \
+    }
+
+VINV_IMPL(float32x2_t, vinv, f32)
+VINV_IMPL_INT(int32x2_t, vinv, s32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINV_IMPL(float16x4_t, vinv, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VINV_IMPL(float32x4_t, vinvq, f32)
+VINV_IMPL_INT(int32x4_t, vinvq, s32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINV_IMPL(float16x8_t, vinvq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VINV_IMPL
+#undef VINV_IMPL_INT
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_INV_H */
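Usage sketch, assuming NEMath's vinv_f32/vinvq_f32 supply the reciprocal estimate plus refinement underneath (which is why this header pulls in NEMath.h):

    #include "src/core/NEON/wrapper/intrinsics/inv.h"

    // Per-lane approximate reciprocal; the integer overloads above
    // deliberately fail at run time with "Not supported".
    inline float32x4_t reciprocal(const float32x4_t denom)
    {
        return arm_compute::wrapper::vinv(denom);
    }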
diff --git a/src/core/NEON/wrapper/intrinsics/invsqrt.h b/src/core/NEON/wrapper/intrinsics/invsqrt.h
new file mode 100644
index 0000000..2343efa
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/invsqrt.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_INVSQRT_H
+#define ARM_COMPUTE_WRAPPER_INVSQRT_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VINVSQRT_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vinvsqrt(const vtype &a)            \
+    {                                                \
+        return prefix##_##postfix(a);                \
+    }
+
+#define VINVSQRT_IMPL_INT(stype, vtype, prefix, postfix) \
+    inline vtype vinvsqrt(const vtype &a)                \
+    {                                                    \
+        ARM_COMPUTE_UNUSED(a);                           \
+        ARM_COMPUTE_ERROR("Not supported");              \
+    }
+
+VINVSQRT_IMPL(float, float32x2_t, vinvsqrt, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINVSQRT_IMPL(float16_t, float16x4_t, vinvsqrt, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINVSQRT_IMPL_INT(int, int32x4_t, vinvsqrt, s32)
+
+VINVSQRT_IMPL(float, float32x4_t, vinvsqrtq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINVSQRT_IMPL(float16_t, float16x8_t, vinvsqrtq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VINVSQRT_IMPL
+#undef VINVSQRT_IMPL_INT
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_INVSQRT_H */
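A sketch of the typical normalisation use, combining vinvsqrt with vmul from the sibling mul.h header; `normalise` is a hypothetical name.

    #include "src/core/NEON/wrapper/intrinsics/invsqrt.h"
    #include "src/core/NEON/wrapper/intrinsics/mul.h"

    // v / sqrt(sum_sq), per lane, without an explicit division.
    inline float32x4_t normalise(const float32x4_t v, const float32x4_t sum_sq)
    {
        return arm_compute::wrapper::vmul(v, arm_compute::wrapper::vinvsqrt(sum_sq));
    }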
diff --git a/src/core/NEON/wrapper/intrinsics/load.h b/src/core/NEON/wrapper/intrinsics/load.h
new file mode 100644
index 0000000..a2116c0
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/load.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_LOAD_H
+#define ARM_COMPUTE_WRAPPER_LOAD_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VLOAD_IMPL(stype, vtype, postfix) \
+    inline vtype vload(const stype *ptr)  \
+    {                                     \
+        return vld1_##postfix(ptr);       \
+    }
+
+VLOAD_IMPL(uint8_t, uint8x8_t, u8)
+VLOAD_IMPL(int8_t, int8x8_t, s8)
+VLOAD_IMPL(uint16_t, uint16x4_t, u16)
+VLOAD_IMPL(int16_t, int16x4_t, s16)
+VLOAD_IMPL(uint32_t, uint32x2_t, u32)
+VLOAD_IMPL(int32_t, int32x2_t, s32)
+//VLOAD_IMPL(uint64_t, uint64x1_t, u64)
+//VLOAD_IMPL(int64_t, int64x1_t, s64)
+VLOAD_IMPL(float, float32x2_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VLOAD_IMPL(float16_t, float16x4_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#define VLOADQ_IMPL(stype, vtype, postfix) \
+    inline vtype vloadq(const stype *ptr)  \
+    {                                      \
+        return vld1q_##postfix(ptr);       \
+    }
+
+VLOADQ_IMPL(uint8_t, uint8x16_t, u8)
+VLOADQ_IMPL(int8_t, int8x16_t, s8)
+VLOADQ_IMPL(uint16_t, uint16x8_t, u16)
+VLOADQ_IMPL(int16_t, int16x8_t, s16)
+VLOADQ_IMPL(uint32_t, uint32x4_t, u32)
+VLOADQ_IMPL(int32_t, int32x4_t, s32)
+//VLOADQ_IMPL(uint64_t, uint64x2_t, u64)
+//VLOADQ_IMPL(int64_t, int64x2_t, s64)
+VLOADQ_IMPL(float, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VLOADQ_IMPL(float16_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#undef VLOAD_IMPL
+#undef VLOADQ_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_LOAD_H */
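Sketch: the caller picks the register width by name (vload for 64-bit, vloadq for 128-bit) while the element type is deduced from the pointer, so kernels never spell out vld1 vs vld1q. Helper names here are hypothetical.

    #include "src/core/NEON/wrapper/intrinsics/load.h"

    inline float32x4_t load_four(const float *ptr)
    {
        return arm_compute::wrapper::vloadq(ptr); // vld1q_f32
    }

    inline uint8x8_t load_eight(const uint8_t *ptr)
    {
        return arm_compute::wrapper::vload(ptr); // vld1_u8
    }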
diff --git a/src/core/NEON/wrapper/intrinsics/log.h b/src/core/NEON/wrapper/intrinsics/log.h
new file mode 100644
index 0000000..357a77c
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/log.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_LOG_H
+#define ARM_COMPUTE_WRAPPER_LOG_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VLOG_IMPL(vtype, prefix, postfix) \
+    inline vtype vlog(const vtype &a)     \
+    {                                     \
+        return prefix##_##postfix(a);     \
+    }
+
+#define VLOG_IMPL_INT(vtype, prefix, postfix) \
+    inline vtype vlog(const vtype &a)         \
+    {                                         \
+        ARM_COMPUTE_UNUSED(a);                \
+        ARM_COMPUTE_ERROR("Not supported");   \
+    }
+
+VLOG_IMPL(float32x4_t, vlogq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VLOG_IMPL(float16x8_t, vlogq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VLOG_IMPL_INT(int32x4_t, vlogq, s32)
+
+#undef VLOG_IMPL
+#undef VLOG_IMPL_INT
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_LOG_H */
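Sketch (the heavy lifting is NEMath's polynomial vlogq_f32; note there is intentionally no d-register overload in this header):

    #include "src/core/NEON/wrapper/intrinsics/log.h"

    // Per-lane natural logarithm.
    inline float32x4_t log_lanes(const float32x4_t x)
    {
        return arm_compute::wrapper::vlog(x);
    }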
diff --git a/src/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h
new file mode 100644
index 0000000..cec437d
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/max.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_MAX_H
+#define ARM_COMPUTE_WRAPPER_MAX_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMAX_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vmax(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VMAX_IMPL(uint8_t, uint8x8_t, vmax, u8)
+VMAX_IMPL(int8_t, int8x8_t, vmax, s8)
+VMAX_IMPL(uint16_t, uint16x4_t, vmax, u16)
+VMAX_IMPL(int16_t, int16x4_t, vmax, s16)
+VMAX_IMPL(uint32_t, uint32x2_t, vmax, u32)
+VMAX_IMPL(int32_t, int32x2_t, vmax, s32)
+VMAX_IMPL(float, float32x2_t, vmax, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAX_IMPL(float16_t, float16x4_t, vmax, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMAX_IMPL(uint8_t, uint8x16_t, vmaxq, u8)
+VMAX_IMPL(int8_t, int8x16_t, vmaxq, s8)
+VMAX_IMPL(uint16_t, uint16x8_t, vmaxq, u16)
+VMAX_IMPL(int16_t, int16x8_t, vmaxq, s16)
+VMAX_IMPL(uint32_t, uint32x4_t, vmaxq, u32)
+VMAX_IMPL(int32_t, int32x4_t, vmaxq, s32)
+VMAX_IMPL(float, float32x4_t, vmaxq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VMAX_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_MAX_H */
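Sketch of the canonical use, a vectorised ReLU; the zero vector uses the raw vdupq_n_f32 rather than the wrapper's tag-based vdup to keep the example minimal.

    #include "src/core/NEON/wrapper/intrinsics/max.h"

    // max(x, 0) per lane.
    inline float32x4_t relu(const float32x4_t x)
    {
        return arm_compute::wrapper::vmax(x, vdupq_n_f32(0.f));
    }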
diff --git a/src/core/NEON/wrapper/intrinsics/min.h b/src/core/NEON/wrapper/intrinsics/min.h
new file mode 100644
index 0000000..8afcb3c
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/min.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_MIN_H
+#define ARM_COMPUTE_WRAPPER_MIN_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMIN_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vmin(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VMIN_IMPL(uint8_t, uint8x8_t, vmin, u8)
+VMIN_IMPL(int8_t, int8x8_t, vmin, s8)
+VMIN_IMPL(uint16_t, uint16x4_t, vmin, u16)
+VMIN_IMPL(int16_t, int16x4_t, vmin, s16)
+VMIN_IMPL(uint32_t, uint32x2_t, vmin, u32)
+VMIN_IMPL(int32_t, int32x2_t, vmin, s32)
+VMIN_IMPL(float, float32x2_t, vmin, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMIN_IMPL(float16_t, float16x4_t, vmin, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMIN_IMPL(uint8_t, uint8x16_t, vminq, u8)
+VMIN_IMPL(int8_t, int8x16_t, vminq, s8)
+VMIN_IMPL(uint16_t, uint16x8_t, vminq, u16)
+VMIN_IMPL(int16_t, int16x8_t, vminq, s16)
+VMIN_IMPL(uint32_t, uint32x4_t, vminq, u32)
+VMIN_IMPL(int32_t, int32x4_t, vminq, s32)
+VMIN_IMPL(float, float32x4_t, vminq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMIN_IMPL(float16_t, float16x8_t, vminq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VMIN_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_MIN_H */
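vmin mirrors vmax above; together they give a per-lane clamp, sketched here with `clamp` as a hypothetical helper.

    #include "src/core/NEON/wrapper/intrinsics/max.h"
    #include "src/core/NEON/wrapper/intrinsics/min.h"

    // Saturate every lane of x into [lo, hi].
    inline float32x4_t clamp(const float32x4_t x, const float32x4_t lo, const float32x4_t hi)
    {
        return arm_compute::wrapper::vmin(arm_compute::wrapper::vmax(x, lo), hi);
    }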
diff --git a/src/core/NEON/wrapper/intrinsics/mla.h b/src/core/NEON/wrapper/intrinsics/mla.h
new file mode 100644
index 0000000..2b38b34
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/mla.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_MLA_H
+#define ARM_COMPUTE_WRAPPER_MLA_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMLA_IMPL(stype, vtype, prefix, postfix)                      \
+    inline vtype vmla(const vtype &a, const vtype &b, const vtype &c) \
+    {                                                                 \
+        return prefix##_##postfix(a, b, c);                           \
+    }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#define VMLA_IMPL2(stype, vtype, prefix1, prefix2, postfix)           \
+    inline vtype vmla(const vtype &a, const vtype &b, const vtype &c) \
+    {                                                                 \
+        return prefix1##_##postfix(a, prefix2##_##postfix(b, c));     \
+    }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMLA_IMPL(uint8x8_t, uint8x8_t, vmla, u8)
+VMLA_IMPL(int8x8_t, int8x8_t, vmla, s8)
+VMLA_IMPL(uint16x4_t, uint16x4_t, vmla, u16)
+VMLA_IMPL(int16x4_t, int16x4_t, vmla, s16)
+VMLA_IMPL(uint32x2_t, uint32x2_t, vmla, u32)
+VMLA_IMPL(int32x2_t, int32x2_t, vmla, s32)
+VMLA_IMPL(float32x2_t, float32x2_t, vmla, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMLA_IMPL2(float16x4_t, float16x4_t, vadd, vmul, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMLA_IMPL(uint8x16_t, uint8x16_t, vmlaq, u8)
+VMLA_IMPL(int8x16_t, int8x16_t, vmlaq, s8)
+VMLA_IMPL(uint16x8_t, uint16x8_t, vmlaq, u16)
+VMLA_IMPL(int16x8_t, int16x8_t, vmlaq, s16)
+VMLA_IMPL(uint32x4_t, uint32x4_t, vmlaq, u32)
+VMLA_IMPL(int32x4_t, int32x4_t, vmlaq, s32)
+VMLA_IMPL(float32x4_t, float32x4_t, vmlaq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMLA_IMPL2(float16x8_t, float16x8_t, vaddq, vmulq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VMLA_IMPL
+#undef VMLA_IMPL2
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_MLA_H */
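Sketch of the accumulate pattern; as the macros above show, on FP16 builds vmla is composed from vadd(vmul(...)) rather than mapping to a single intrinsic. `axpy` is a hypothetical name.

    #include "src/core/NEON/wrapper/intrinsics/mla.h"

    // acc + x * a, per lane (an AXPY step).
    inline float32x4_t axpy(const float32x4_t acc, const float32x4_t x, const float32x4_t a)
    {
        return arm_compute::wrapper::vmla(acc, x, a);
    }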
diff --git a/src/core/NEON/wrapper/intrinsics/movl.h b/src/core/NEON/wrapper/intrinsics/movl.h
new file mode 100644
index 0000000..99f2150
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/movl.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_MOVL_H
+#define ARM_COMPUTE_WRAPPER_MOVL_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMOVL_IMPL(ptype, vtype, prefix, postfix) \
+    inline ptype vmovl(const vtype &a)            \
+    {                                             \
+        return prefix##_##postfix(a);             \
+    }
+
+VMOVL_IMPL(uint16x8_t, uint8x8_t, vmovl, u8)
+VMOVL_IMPL(int16x8_t, int8x8_t, vmovl, s8)
+VMOVL_IMPL(uint32x4_t, uint16x4_t, vmovl, u16)
+VMOVL_IMPL(int32x4_t, int16x4_t, vmovl, s16)
+VMOVL_IMPL(uint64x2_t, uint32x2_t, vmovl, u32)
+VMOVL_IMPL(int64x2_t, int32x2_t, vmovl, s32)
+
+#undef VMOVL_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_MOVL_H */
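Sketch: widening before arithmetic that would overflow the narrow type; `widen_accumulate` is a hypothetical name.

    #include "src/core/NEON/wrapper/intrinsics/movl.h"

    // Promote u8 pixels to u16 so that summing two rows cannot wrap.
    inline uint16x8_t widen_accumulate(const uint8x8_t row0, const uint8x8_t row1)
    {
        return vaddq_u16(arm_compute::wrapper::vmovl(row0),
                         arm_compute::wrapper::vmovl(row1));
    }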
diff --git a/src/core/NEON/wrapper/intrinsics/movn.h b/src/core/NEON/wrapper/intrinsics/movn.h
new file mode 100644
index 0000000..460c277
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/movn.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_MOVN_H
+#define ARM_COMPUTE_WRAPPER_MOVN_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMOVN_IMPL(dtype, vtype, prefix, postfix) \
+    inline dtype vmovn(const vtype &a)            \
+    {                                             \
+        return prefix##_##postfix(a);             \
+    }
+
+VMOVN_IMPL(uint32x2_t, uint64x2_t, vmovn, u64)
+VMOVN_IMPL(int32x2_t, int64x2_t, vmovn, s64)
+VMOVN_IMPL(uint16x4_t, uint32x4_t, vmovn, u32)
+VMOVN_IMPL(int16x4_t, int32x4_t, vmovn, s32)
+VMOVN_IMPL(uint8x8_t, uint16x8_t, vmovn, u16)
+VMOVN_IMPL(int8x8_t, int16x8_t, vmovn, s16)
+
+#define VQMOVN_IMPL(dtype, vtype, prefix, postfix) \
+    inline dtype vqmovn(const vtype &a)            \
+    {                                              \
+        return prefix##_##postfix(a);              \
+    }
+
+VQMOVN_IMPL(uint32x2_t, uint64x2_t, vqmovn, u64)
+VQMOVN_IMPL(int32x2_t, int64x2_t, vqmovn, s64)
+VQMOVN_IMPL(uint16x4_t, uint32x4_t, vqmovn, u32)
+VQMOVN_IMPL(int16x4_t, int32x4_t, vqmovn, s32)
+VQMOVN_IMPL(uint8x8_t, uint16x8_t, vqmovn, u16)
+VQMOVN_IMPL(int8x8_t, int16x8_t, vqmovn, s16)
+
+#undef VMOVN_IMPL
+#undef VQMOVN_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_MOVN_H */
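Sketch of the difference between the two narrows: vmovn truncates while vqmovn saturates, which is usually what quantised kernels want. Helper names are hypothetical.

    #include "src/core/NEON/wrapper/intrinsics/movn.h"

    inline int16x4_t narrow_truncate(const int32x4_t acc)
    {
        return arm_compute::wrapper::vmovn(acc); // keeps the low 16 bits
    }

    inline int16x4_t narrow_saturate(const int32x4_t acc)
    {
        return arm_compute::wrapper::vqmovn(acc); // clamps to [-32768, 32767]
    }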
diff --git a/src/core/NEON/wrapper/intrinsics/mul.h b/src/core/NEON/wrapper/intrinsics/mul.h
new file mode 100644
index 0000000..6296fff
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/mul.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_MUL_H
+#define ARM_COMPUTE_WRAPPER_MUL_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMUL_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vmul(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VMUL_IMPL(uint8_t, uint8x8_t, vmul, u8)
+VMUL_IMPL(int8_t, int8x8_t, vmul, s8)
+VMUL_IMPL(uint16_t, uint16x4_t, vmul, u16)
+VMUL_IMPL(int16_t, int16x4_t, vmul, s16)
+VMUL_IMPL(uint32_t, uint32x2_t, vmul, u32)
+VMUL_IMPL(int32_t, int32x2_t, vmul, s32)
+VMUL_IMPL(float, float32x2_t, vmul, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMUL_IMPL(float16_t, float16x4_t, vmul, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMUL_IMPL(uint8_t, uint8x16_t, vmulq, u8)
+VMUL_IMPL(int8_t, int8x16_t, vmulq, s8)
+VMUL_IMPL(uint16_t, uint16x8_t, vmulq, u16)
+VMUL_IMPL(int16_t, int16x8_t, vmulq, s16)
+VMUL_IMPL(uint32_t, uint32x4_t, vmulq, u32)
+VMUL_IMPL(int32_t, int32x4_t, vmulq, s32)
+VMUL_IMPL(float, float32x4_t, vmulq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMUL_IMPL(float16_t, float16x8_t, vmulq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VMUL_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_MUL_H */
diff --git a/src/core/NEON/wrapper/intrinsics/neg.h b/src/core/NEON/wrapper/intrinsics/neg.h
new file mode 100644
index 0000000..5e45566
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/neg.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_NEG_H
+#define ARM_COMPUTE_WRAPPER_NEG_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VNEG_IMPL(vtype, prefix, postfix) \
+    inline vtype vneg(const vtype &a)     \
+    {                                     \
+        return prefix##_##postfix(a);     \
+    }
+
+VNEG_IMPL(int8x8_t, vneg, s8)
+VNEG_IMPL(int16x4_t, vneg, s16)
+VNEG_IMPL(int32x2_t, vneg, s32)
+VNEG_IMPL(float32x2_t, vneg, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VNEG_IMPL(float16x4_t, vneg, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VNEG_IMPL(int8x16_t, vnegq, s8)
+VNEG_IMPL(int16x8_t, vnegq, s16)
+VNEG_IMPL(int32x4_t, vnegq, s32)
+VNEG_IMPL(float32x4_t, vnegq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VNEG_IMPL(float16x8_t, vnegq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VNEG_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_NEG_H */
diff --git a/src/core/NEON/wrapper/intrinsics/not.h b/src/core/NEON/wrapper/intrinsics/not.h
new file mode 100644
index 0000000..5853e84
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/not.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_NOT_H
+#define ARM_COMPUTE_WRAPPER_NOT_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VNOT_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vnot(const vtype &a)            \
+    {                                            \
+        return prefix##_##postfix(a);            \
+    }
+
+VNOT_IMPL(uint8_t, uint8x8_t, vmvn, u8)
+VNOT_IMPL(int8_t, int8x8_t, vmvn, s8)
+VNOT_IMPL(uint16_t, uint16x4_t, vmvn, u16)
+VNOT_IMPL(int16_t, int16x4_t, vmvn, s16)
+VNOT_IMPL(uint32_t, uint32x2_t, vmvn, u32)
+VNOT_IMPL(int32_t, int32x2_t, vmvn, s32)
+VNOT_IMPL(float32x2_t, float32x2_t, vinv, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VNOT_IMPL(float16x4_t, float16x4_t, vinv, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VNOT_IMPL(uint8_t, uint8x16_t, vmvnq, u8)
+VNOT_IMPL(int8_t, int8x16_t, vmvnq, s8)
+VNOT_IMPL(uint16_t, uint16x8_t, vmvnq, u16)
+VNOT_IMPL(int16_t, int16x8_t, vmvnq, s16)
+VNOT_IMPL(uint32_t, uint32x4_t, vmvnq, u32)
+VNOT_IMPL(int32_t, int32x4_t, vmvnq, s32)
+VNOT_IMPL(float32x4_t, float32x4_t, vinvq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VNOT_IMPL(float16x8_t, float16x8_t, vinvq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VNOT_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_NOT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/orr.h b/src/core/NEON/wrapper/intrinsics/orr.h
new file mode 100644
index 0000000..cc83e95
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/orr.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_ORR_H
+#define ARM_COMPUTE_WRAPPER_ORR_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VORR_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vorr(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VORR_IMPL(uint8_t, uint8x8_t, vorr, u8)
+VORR_IMPL(int8_t, int8x8_t, vorr, s8)
+VORR_IMPL(uint16_t, uint16x4_t, vorr, u16)
+VORR_IMPL(int16_t, int16x4_t, vorr, s16)
+VORR_IMPL(uint32_t, uint32x2_t, vorr, u32)
+VORR_IMPL(int32_t, int32x2_t, vorr, s32)
+VORR_IMPL(uint64_t, uint64x1_t, vorr, u64)
+VORR_IMPL(int64_t, int64x1_t, vorr, s64)
+
+VORR_IMPL(uint8_t, uint8x16_t, vorrq, u8)
+VORR_IMPL(int8_t, int8x16_t, vorrq, s8)
+VORR_IMPL(uint16_t, uint16x8_t, vorrq, u16)
+VORR_IMPL(int16_t, int16x8_t, vorrq, s16)
+VORR_IMPL(uint32_t, uint32x4_t, vorrq, u32)
+VORR_IMPL(int32_t, int32x4_t, vorrq, s32)
+VORR_IMPL(uint64_t, uint64x2_t, vorrq, u64)
+VORR_IMPL(int64_t, int64x2_t, vorrq, s64)
+
+#undef VORR_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_ORR_H */
diff --git a/src/core/NEON/wrapper/intrinsics/pmax.h b/src/core/NEON/wrapper/intrinsics/pmax.h
new file mode 100644
index 0000000..cd2b2d1
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/pmax.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_PMAX_H
+#define ARM_COMPUTE_WRAPPER_PMAX_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VPMAX_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vpmax(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VPMAX_IMPL(uint8_t, uint8x8_t, vpmax, u8)
+VPMAX_IMPL(int8_t, int8x8_t, vpmax, s8)
+VPMAX_IMPL(uint16_t, uint16x4_t, vpmax, u16)
+VPMAX_IMPL(int16_t, int16x4_t, vpmax, s16)
+VPMAX_IMPL(uint32_t, uint32x2_t, vpmax, u32)
+VPMAX_IMPL(int32_t, int32x2_t, vpmax, s32)
+VPMAX_IMPL(float, float32x2_t, vpmax, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VPMAX_IMPL(float16_t, float16x4_t, vpmax, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VPMAX_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_PMAX_H */
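Sketch of a horizontal max over a d-register; a q-register reduction would first split via vgetlow/vgethigh and vmax the halves. `hmax` is a hypothetical name.

    #include "src/core/NEON/wrapper/intrinsics/getlane.h"
    #include "src/core/NEON/wrapper/intrinsics/pmax.h"

    inline float hmax(const float32x2_t v)
    {
        // After one pairwise step both lanes hold max(v[0], v[1]).
        const float32x2_t m = arm_compute::wrapper::vpmax(v, v);
        return arm_compute::wrapper::vgetlane(m, 0);
    }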
diff --git a/src/core/NEON/wrapper/intrinsics/pmin.h b/src/core/NEON/wrapper/intrinsics/pmin.h
new file mode 100644
index 0000000..59b6be6
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/pmin.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_PMIN_H
+#define ARM_COMPUTE_WRAPPER_PMIN_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VPMIN_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vpmin(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VPMIN_IMPL(uint8_t, uint8x8_t, vpmin, u8)
+VPMIN_IMPL(int8_t, int8x8_t, vpmin, s8)
+VPMIN_IMPL(uint16_t, uint16x4_t, vpmin, u16)
+VPMIN_IMPL(int16_t, int16x4_t, vpmin, s16)
+VPMIN_IMPL(uint32_t, uint32x2_t, vpmin, u32)
+VPMIN_IMPL(int32_t, int32x2_t, vpmin, s32)
+VPMIN_IMPL(float, float32x2_t, vpmin, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VPMIN_IMPL(float16_t, float16x4_t, vpmin, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VPMIN_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_PMIN_H */
diff --git a/src/core/NEON/wrapper/intrinsics/pow.h b/src/core/NEON/wrapper/intrinsics/pow.h
new file mode 100644
index 0000000..61f834e
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/pow.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_POW_H
+#define ARM_COMPUTE_WRAPPER_POW_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VPOW_IMPL(vtype, prefix, postfix)             \
+    inline vtype vpow(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VPOW_IMPL(float32x4_t, vpowq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VPOW_IMPL(float16x8_t, vpowq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VPOW_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_POW_H */
diff --git a/src/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h
new file mode 100644
index 0000000..167f3cf
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/qmov.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_QMOV_H
+#define ARM_COMPUTE_WRAPPER_QMOV_H
+
+#include <arm_neon.h>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type
+vqmov(const int16x8_t &a)
+{
+    return vqmovun_s16(a);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type
+vqmov(const int16x8_t &a)
+{
+    return vqmovn_s16(a);
+}
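+
+// Example (assumed usage): narrow an int16x8_t accumulator to the destination
+// quantized element type selected at compile time:
+//   const auto out_u8 = vqmov<uint8_t>(acc); // uint8x8_t via vqmovun_s16
+//   const auto out_s8 = vqmov<int8_t>(acc);  // int8x8_t via vqmovn_s16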
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_QMOV_H */
diff --git a/src/core/NEON/wrapper/intrinsics/qmovun.h b/src/core/NEON/wrapper/intrinsics/qmovun.h
new file mode 100644
index 0000000..f823ddb
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/qmovun.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_QMOVUN_H
+#define ARM_COMPUTE_WRAPPER_QMOVUN_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VQMOVUN_IMPL(dtype, vtype, prefix, postfix) \
+    inline dtype vqmovun(const vtype &a)            \
+    {                                               \
+        return prefix##_##postfix(a);               \
+    }
+
+VQMOVUN_IMPL(uint32x2_t, int64x2_t, vqmovun, s64)
+VQMOVUN_IMPL(uint16x4_t, int32x4_t, vqmovun, s32)
+VQMOVUN_IMPL(uint8x8_t, int16x8_t, vqmovun, s16)
+
+#undef VQMOVUN_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_QMOVUN_H */
diff --git a/src/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h
new file mode 100644
index 0000000..0c26cd9
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/reinterpret.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_REINTERPRET_H
+#define ARM_COMPUTE_WRAPPER_REINTERPRET_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VREINTERPRET_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
+    inline ptype vreinterpret(const vtype &a)                       \
+    {                                                               \
+        return prefix##_##postfix1##_##postfix2(a);                 \
+    }                                                               \
+    \
+    inline ptype vreinterpret(const ptype &a)                       \
+    {                                                               \
+        return a;                                                   \
+    }
+
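+// The identity overload (ptype -> ptype) lets generic code call vreinterpret()
+// unconditionally, whether or not the argument already has the target type.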
+VREINTERPRET_IMPL(int16x4_t, uint16x4_t, vreinterpret, s16, u16)
+
+VREINTERPRET_IMPL(int32x4_t, uint32x4_t, vreinterpretq, s32, u32)
+
+#undef VREINTERPRET_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_REINTERPRET_H */
diff --git a/src/core/NEON/wrapper/intrinsics/rev64.h b/src/core/NEON/wrapper/intrinsics/rev64.h
new file mode 100644
index 0000000..0f0139c
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/rev64.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_REV64_H
+#define ARM_COMPUTE_WRAPPER_REV64_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VREV64_IMPL(vtype, prefix, postfix) \
+    inline vtype vrev64(const vtype &a)     \
+    {                                       \
+        return prefix##_##postfix(a);       \
+    }
+
+VREV64_IMPL(uint8x8_t, vrev64, u8)
+VREV64_IMPL(int8x8_t, vrev64, s8)
+VREV64_IMPL(uint16x4_t, vrev64, u16)
+VREV64_IMPL(int16x4_t, vrev64, s16)
+VREV64_IMPL(uint32x2_t, vrev64, u32)
+VREV64_IMPL(int32x2_t, vrev64, s32)
+VREV64_IMPL(float32x2_t, vrev64, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VREV64_IMPL(float16x4_t, vrev64, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VREV64_IMPL(uint8x16_t, vrev64q, u8)
+VREV64_IMPL(int8x16_t, vrev64q, s8)
+VREV64_IMPL(uint16x8_t, vrev64q, u16)
+VREV64_IMPL(int16x8_t, vrev64q, s16)
+VREV64_IMPL(uint32x4_t, vrev64q, u32)
+VREV64_IMPL(int32x4_t, vrev64q, s32)
+VREV64_IMPL(float32x4_t, vrev64q, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VREV64_IMPL(float16x8_t, vrev64q, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VREV64_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_REV64_H */
diff --git a/src/core/NEON/wrapper/intrinsics/round.h b/src/core/NEON/wrapper/intrinsics/round.h
new file mode 100644
index 0000000..d23feb6
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/round.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_ROUND_H
+#define ARM_COMPUTE_WRAPPER_ROUND_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VROUNDQ_IMPL(vtype, postfix)     \
+    inline vtype vround(const vtype &a)  \
+    {                                    \
+        return vroundq_rte_##postfix(a); \
+    }
+
+#define VROUNDQ_IMPL_INT(vtype, postfix)    \
+    inline vtype vround(const vtype &a)     \
+    {                                       \
+        ARM_COMPUTE_UNUSED(a);              \
+        ARM_COMPUTE_ERROR("Not supported"); \
+    }
+
+VROUNDQ_IMPL(float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VROUNDQ_IMPL(float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VROUNDQ_IMPL_INT(int32x4_t, s32)
+#undef VROUNDQ_IMPL
+#undef VROUNDQ_IMPL_INT
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_ROUND_H */
diff --git a/src/core/NEON/wrapper/intrinsics/setlane.h b/src/core/NEON/wrapper/intrinsics/setlane.h
new file mode 100644
index 0000000..197eeda
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/setlane.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SET_LANE_H
+#define ARM_COMPUTE_WRAPPER_SET_LANE_H
+
+#include "arm_compute/core/Error.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
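+// vset_lane/vsetq_lane require the lane index to be a compile-time constant,
+// so the wrappers below switch on the runtime lane and dispatch accordingly.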
+#define VSETLANE_IMPL_8(stype, atype, vtype, postfix)                                     \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    {                                                                                     \
+        switch(lane)                                                                      \
+        {                                                                                 \
+            case 0:                                                                       \
+                return vset_lane_##postfix(value, vector, 0);                             \
+            case 1:                                                                       \
+                return vset_lane_##postfix(value, vector, 1);                             \
+            case 2:                                                                       \
+                return vset_lane_##postfix(value, vector, 2);                             \
+            case 3:                                                                       \
+                return vset_lane_##postfix(value, vector, 3);                             \
+            case 4:                                                                       \
+                return vset_lane_##postfix(value, vector, 4);                             \
+            case 5:                                                                       \
+                return vset_lane_##postfix(value, vector, 5);                             \
+            case 6:                                                                       \
+                return vset_lane_##postfix(value, vector, 6);                             \
+            case 7:                                                                       \
+                return vset_lane_##postfix(value, vector, 7);                             \
+            default:                                                                      \
+                ARM_COMPUTE_ERROR("Invalid lane");                                        \
+        }                                                                                 \
+    }
+
+#define VSETLANE_IMPL_4(stype, atype, vtype, postfix)                                     \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    {                                                                                     \
+        switch(lane)                                                                      \
+        {                                                                                 \
+            case 0:                                                                       \
+                return vset_lane_##postfix(value, vector, 0);                             \
+            case 1:                                                                       \
+                return vset_lane_##postfix(value, vector, 1);                             \
+            case 2:                                                                       \
+                return vset_lane_##postfix(value, vector, 2);                             \
+            case 3:                                                                       \
+                return vset_lane_##postfix(value, vector, 3);                             \
+            default:                                                                      \
+                ARM_COMPUTE_ERROR("Invalid lane");                                        \
+        }                                                                                 \
+    }
+
+#define VSETLANE_IMPL_2(stype, atype, vtype, postfix)                                     \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    {                                                                                     \
+        switch(lane)                                                                      \
+        {                                                                                 \
+            case 0:                                                                       \
+                return vset_lane_##postfix(value, vector, 0);                             \
+            case 1:                                                                       \
+                return vset_lane_##postfix(value, vector, 1);                             \
+            default:                                                                      \
+                ARM_COMPUTE_ERROR("Invalid lane");                                        \
+        }                                                                                 \
+    }
+
+VSETLANE_IMPL_8(uint8x8_t, uint8_t, uint8x8_t, u8)
+VSETLANE_IMPL_8(int8x8_t, int8_t, int8x8_t, s8)
+VSETLANE_IMPL_4(uint16x4_t, uint16_t, uint16x4_t, u16)
+VSETLANE_IMPL_4(int16x4_t, int16_t, int16x4_t, s16)
+VSETLANE_IMPL_2(uint32x2_t, uint32_t, uint32x2_t, u32)
+VSETLANE_IMPL_2(int32x2_t, int32_t, int32x2_t, s32)
+VSETLANE_IMPL_2(float32x2_t, float, float32x2_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#define VSETQLANE_IMPL_16(stype, atype, vtype, postfix)                                   \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    {                                                                                     \
+        switch(lane)                                                                      \
+        {                                                                                 \
+            case 0:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 0);                            \
+            case 1:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 1);                            \
+            case 2:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 2);                            \
+            case 3:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 3);                            \
+            case 4:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 4);                            \
+            case 5:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 5);                            \
+            case 6:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 6);                            \
+            case 7:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 7);                            \
+            case 8:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 8);                            \
+            case 9:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 9);                            \
+            case 10:                                                                      \
+                return vsetq_lane_##postfix(value, vector, 10);                           \
+            case 11:                                                                      \
+                return vsetq_lane_##postfix(value, vector, 11);                           \
+            case 12:                                                                      \
+                return vsetq_lane_##postfix(value, vector, 12);                           \
+            case 13:                                                                      \
+                return vsetq_lane_##postfix(value, vector, 13);                           \
+            case 14:                                                                      \
+                return vsetq_lane_##postfix(value, vector, 14);                           \
+            case 15:                                                                      \
+                return vsetq_lane_##postfix(value, vector, 15);                           \
+            default:                                                                      \
+                ARM_COMPUTE_ERROR("Invalid lane");                                        \
+        }                                                                                 \
+    }
+
+#define VSETQLANE_IMPL_8(stype, atype, vtype, postfix)                                    \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    {                                                                                     \
+        switch(lane)                                                                      \
+        {                                                                                 \
+            case 0:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 0);                            \
+            case 1:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 1);                            \
+            case 2:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 2);                            \
+            case 3:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 3);                            \
+            case 4:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 4);                            \
+            case 5:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 5);                            \
+            case 6:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 6);                            \
+            case 7:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 7);                            \
+            default:                                                                      \
+                ARM_COMPUTE_ERROR("Invalid lane");                                        \
+        }                                                                                 \
+    }
+
+#define VSETQLANE_IMPL_4(stype, atype, vtype, postfix)                                    \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    {                                                                                     \
+        switch(lane)                                                                      \
+        {                                                                                 \
+            case 0:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 0);                            \
+            case 1:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 1);                            \
+            case 2:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 2);                            \
+            case 3:                                                                       \
+                return vsetq_lane_##postfix(value, vector, 3);                            \
+            default:                                                                      \
+                ARM_COMPUTE_ERROR("Invalid lane");                                        \
+        }                                                                                 \
+    }
+
+VSETQLANE_IMPL_16(uint8x16_t, uint8_t, uint8x16_t, u8)
+VSETQLANE_IMPL_16(int8x16_t, int8_t, int8x16_t, s8)
+VSETQLANE_IMPL_8(uint16x8_t, uint16_t, uint16x8_t, u16)
+VSETQLANE_IMPL_8(int16x8_t, int16_t, int16x8_t, s16)
+VSETQLANE_IMPL_4(uint32x4_t, uint32_t, uint32x4_t, u32)
+VSETQLANE_IMPL_4(int32x4_t, int32_t, int32x4_t, s32)
+VSETQLANE_IMPL_4(float32x4_t, float, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSETQLANE_IMPL_8(float16x8_t, float16_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VSETLANE_IMPL_8
+#undef VSETLANE_IMPL_4
+#undef VSETLANE_IMPL_2
+
+#undef VSETQLANE_IMPL_16
+#undef VSETQLANE_IMPL_8
+#undef VSETQLANE_IMPL_4
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SET_LANE_H */
diff --git a/src/core/NEON/wrapper/intrinsics/sin.h b/src/core/NEON/wrapper/intrinsics/sin.h
new file mode 100644
index 0000000..03c2813
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/sin.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SIN_H
+#define ARM_COMPUTE_WRAPPER_SIN_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VSIN_IMPL(vtype, prefix, postfix) \
+    inline vtype vsin(const vtype &a)     \
+    {                                     \
+        return prefix##_##postfix(a);     \
+    }
+
+#define VSIN_IMPL_INT(vtype, prefix, postfix) \
+    inline vtype vsin(const vtype &a)         \
+    {                                         \
+        ARM_COMPUTE_UNUSED(a);                \
+        ARM_COMPUTE_ERROR("Not supported");   \
+    }
+
+VSIN_IMPL(float32x4_t, vsinq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSIN_IMPL(float16x8_t, vsinq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VSIN_IMPL_INT(int32x4_t, vsinq, s32)
+
+#undef VSIN_IMPL
+#undef VSIN_IMPL_INT
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SIN_H */
diff --git a/src/core/NEON/wrapper/intrinsics/store.h b/src/core/NEON/wrapper/intrinsics/store.h
new file mode 100644
index 0000000..6dda432
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/store.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_STORE_H
+#define ARM_COMPUTE_WRAPPER_STORE_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VSTORE_IMPL(stype, vtype, prefix, postfix) \
+    inline void vstore(stype *ptr, vtype val)      \
+    {                                              \
+        prefix##_##postfix(ptr, val);              \
+    }
+
+VSTORE_IMPL(uint8_t, uint8x8_t, vst1, u8)
+VSTORE_IMPL(uint8_t, uint8x8x2_t, vst2, u8)
+VSTORE_IMPL(int8_t, int8x8_t, vst1, s8)
+VSTORE_IMPL(int8_t, int8x8x2_t, vst2, s8)
+VSTORE_IMPL(uint16_t, uint16x4_t, vst1, u16)
+VSTORE_IMPL(int16_t, int16x4_t, vst1, s16)
+VSTORE_IMPL(uint32_t, uint32x2_t, vst1, u32)
+VSTORE_IMPL(int32_t, int32x2_t, vst1, s32)
+//VSTORE_IMPL(uint64_t, uint64x1_t, vst1, u64)
+//VSTORE_IMPL(int64_t, int64x1_t, vst1, s64)
+VSTORE_IMPL(float, float32x2_t, vst1, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSTORE_IMPL(float16_t, float16x4_t, vst1, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VSTORE_IMPL(uint8_t, uint8x16_t, vst1q, u8)
+VSTORE_IMPL(int8_t, int8x16_t, vst1q, s8)
+VSTORE_IMPL(uint16_t, uint16x8_t, vst1q, u16)
+VSTORE_IMPL(int16_t, int16x8_t, vst1q, s16)
+VSTORE_IMPL(uint32_t, uint32x4_t, vst1q, u32)
+VSTORE_IMPL(int32_t, int32x4_t, vst1q, s32)
+//VSTORE_IMPL(uint64_t, uint64x2_t, vst1q, u64)
+//VSTORE_IMPL(int64_t, int64x2_t, vst1q, s64)
+VSTORE_IMPL(float, float32x4_t, vst1q, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSTORE_IMPL(float16_t, float16x8_t, vst1q, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VSTORE_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_STORE_H */
diff --git a/src/core/NEON/wrapper/intrinsics/sub.h b/src/core/NEON/wrapper/intrinsics/sub.h
new file mode 100644
index 0000000..475986d
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/sub.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SUB_H
+#define ARM_COMPUTE_WRAPPER_SUB_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VSUB_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vsub(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VSUB_IMPL(uint8x8_t, uint8x8_t, vsub, u8)
+VSUB_IMPL(int8x8_t, int8x8_t, vsub, s8)
+VSUB_IMPL(uint16x4_t, uint16x4_t, vsub, u16)
+VSUB_IMPL(int16x4_t, int16x4_t, vsub, s16)
+VSUB_IMPL(uint32x2_t, uint32x2_t, vsub, u32)
+VSUB_IMPL(int32x2_t, int32x2_t, vsub, s32)
+VSUB_IMPL(uint64x1_t, uint64x1_t, vsub, u64)
+VSUB_IMPL(int64x1_t, int64x1_t, vsub, s64)
+VSUB_IMPL(float32x2_t, float32x2_t, vsub, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSUB_IMPL(float16x4_t, float16x4_t, vsub, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VSUB_IMPL(uint8x16_t, uint8x16_t, vsubq, u8)
+VSUB_IMPL(int8x16_t, int8x16_t, vsubq, s8)
+VSUB_IMPL(uint16x8_t, uint16x8_t, vsubq, u16)
+VSUB_IMPL(int16x8_t, int16x8_t, vsubq, s16)
+VSUB_IMPL(uint32x4_t, uint32x4_t, vsubq, u32)
+VSUB_IMPL(int32x4_t, int32x4_t, vsubq, s32)
+VSUB_IMPL(uint64x2_t, uint64x2_t, vsubq, u64)
+VSUB_IMPL(int64x2_t, int64x2_t, vsubq, s64)
+VSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VSUB_IMPL
+
+// VQSUB: Vector saturating sub (No notion of saturation for floating point)
+#define VQSUB_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vqsub(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VQSUB_IMPL(uint8x8_t, uint8x8_t, vqsub, u8)
+VQSUB_IMPL(int8x8_t, int8x8_t, vqsub, s8)
+VQSUB_IMPL(uint16x4_t, uint16x4_t, vqsub, u16)
+VQSUB_IMPL(int16x4_t, int16x4_t, vqsub, s16)
+VQSUB_IMPL(uint32x2_t, uint32x2_t, vqsub, u32)
+VQSUB_IMPL(int32x2_t, int32x2_t, vqsub, s32)
+VQSUB_IMPL(uint64x1_t, uint64x1_t, vqsub, u64)
+VQSUB_IMPL(int64x1_t, int64x1_t, vqsub, s64)
+VQSUB_IMPL(float32x2_t, float32x2_t, vsub, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VQSUB_IMPL(float16x4_t, float16x4_t, vsub, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VQSUB_IMPL(uint8x16_t, uint8x16_t, vqsubq, u8)
+VQSUB_IMPL(int8x16_t, int8x16_t, vqsubq, s8)
+VQSUB_IMPL(uint16x8_t, uint16x8_t, vqsubq, u16)
+VQSUB_IMPL(int16x8_t, int16x8_t, vqsubq, s16)
+VQSUB_IMPL(uint32x4_t, uint32x4_t, vqsubq, u32)
+VQSUB_IMPL(int32x4_t, int32x4_t, vqsubq, s32)
+VQSUB_IMPL(uint64x2_t, uint64x2_t, vqsubq, u64)
+VQSUB_IMPL(int64x2_t, int64x2_t, vqsubq, s64)
+VQSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VQSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#undef VQSUB_IMPL
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
diff --git a/src/core/NEON/wrapper/intrinsics/tanh.h b/src/core/NEON/wrapper/intrinsics/tanh.h
new file mode 100644
index 0000000..daeaf19
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/tanh.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_TANH_H
+#define ARM_COMPUTE_WRAPPER_TANH_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VTANH_IMPL(vtype, prefix, postfix) \
+    inline vtype vtanh(const vtype &a)     \
+    {                                      \
+        return prefix##_##postfix(a);      \
+    }
+
+VTANH_IMPL(float32x4_t, vtanhq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VTANH_IMPL(float16x8_t, vtanhq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#undef VTANH_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_TANH_H */
diff --git a/src/core/NEON/wrapper/intrinsics/tbl.h b/src/core/NEON/wrapper/intrinsics/tbl.h
new file mode 100644
index 0000000..05e6c1f
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/tbl.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_TBL_H
+#define ARM_COMPUTE_WRAPPER_TBL_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VTBL_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vtbl(const stype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VTBL_IMPL(uint8x8x2_t, uint8x8_t, vtbl2, u8)
+VTBL_IMPL(int8x8x2_t, int8x8_t, vtbl2, s8)
+
+#undef VTBL_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_TBL_H */
diff --git a/src/core/NEON/wrapper/scalar/add.h b/src/core/NEON/wrapper/scalar/add.h
new file mode 100644
index 0000000..642d926
--- /dev/null
+++ b/src/core/NEON/wrapper/scalar/add.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SCALAR_ADD_H
+#define ARM_COMPUTE_WRAPPER_SCALAR_ADD_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
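+// Scalar saturating add: place each operand in lane 0 of a NEON vector, apply
+// the saturating vector instruction, then extract lane 0 of the result.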
+inline uint8_t add_sat(const uint8_t &a, const uint8_t &b)
+{
+    const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
+    const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+    return vget_lane_u8(vqadd_u8(va, vb), 0);
+}
+
+inline int16_t add_sat(const int16_t &a, const int16_t &b)
+{
+    const int16x4_t va = { a, 0, 0, 0 };
+    const int16x4_t vb = { b, 0, 0, 0 };
+    return vget_lane_s16(vqadd_s16(va, vb), 0);
+}
+
+inline int32_t add_sat(const int32_t &a, const int32_t &b)
+{
+    const int32x2_t va = { a, 0 };
+    const int32x2_t vb = { b, 0 };
+    return vget_lane_s32(vqadd_s32(va, vb), 0);
+}
+
+inline float add_sat(const float &a, const float &b)
+{
+    // No notion of saturation exists in floating point
+    return a + b;
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float16_t add_sat(const float16_t &a, const float16_t &b)
+{
+    // No notion of saturation exists in floating point
+    return a + b;
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SCALAR_ADD_H */
diff --git a/src/core/NEON/wrapper/scalar/scalar.h b/src/core/NEON/wrapper/scalar/scalar.h
new file mode 100644
index 0000000..8be37e5
--- /dev/null
+++ b/src/core/NEON/wrapper/scalar/scalar.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SCALAR_H
+#define ARM_COMPUTE_WRAPPER_SCALAR_H
+
+#include "src/core/NEON/wrapper/scalar/add.h"
+#include "src/core/NEON/wrapper/scalar/sub.h"
+
+#endif /* ARM_COMPUTE_WRAPPER_SCALAR_H */
diff --git a/src/core/NEON/wrapper/scalar/sub.h b/src/core/NEON/wrapper/scalar/sub.h
new file mode 100644
index 0000000..1fe51d7
--- /dev/null
+++ b/src/core/NEON/wrapper/scalar/sub.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SCALAR_SUB_H
+#define ARM_COMPUTE_WRAPPER_SCALAR_SUB_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b)
+{
+    const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
+    const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+    return vget_lane_u8(vqsub_u8(va, vb), 0);
+}
+
+inline int16_t sub_sat(const int16_t &a, const int16_t &b)
+{
+    const int16x4_t va = { a, 0, 0, 0 };
+    const int16x4_t vb = { b, 0, 0, 0 };
+    return vget_lane_s16(vqsub_s16(va, vb), 0);
+}
+
+inline int32_t sub_sat(const int32_t &a, const int32_t &b)
+{
+    const int32x2_t va = { a, 0 };
+    const int32x2_t vb = { b, 0 };
+    return vget_lane_s32(vqsub_s32(va, vb), 0);
+}
+
+inline float sub_sat(const float &a, const float &b)
+{
+    // No notion of saturation exists in floating point
+    return a - b;
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float16_t sub_sat(const float16_t &a, const float16_t &b)
+{
+    // No notion of saturation exists in floating point
+    return a - b;
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SCALAR_SUB_H */
diff --git a/src/core/NEON/wrapper/traits.h b/src/core/NEON/wrapper/traits.h
new file mode 100644
index 0000000..eafbeef
--- /dev/null
+++ b/src/core/NEON/wrapper/traits.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_TRAITS_H
+#define ARM_COMPUTE_WRAPPER_TRAITS_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+namespace traits
+{
+// *INDENT-OFF*
+// clang-format off
+
+/** 64-bit vector tag */
+struct vector_64_tag {};
+/** 128-bit vector tag */
+struct vector_128_tag {};
+
+/** Create the appropriate NEON vector given its type and size in terms of elements */
+template <typename T, int S> struct neon_vector;
+
+// Specializations
+#ifndef DOXYGEN_SKIP_THIS
+template <> struct neon_vector<uint8_t, 8>{ using scalar_type = uint8_t; using type = uint8x8_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<int8_t, 8>{ using scalar_type = int8_t; using type = int8x8_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<uint8_t, 16>{ using scalar_type = uint8_t; using type = uint8x16_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<int8_t, 16>{ using scalar_type = int8_t; using type = int8x16_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<uint16_t, 4>{ using scalar_type = uint16_t; using type = uint16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<int16_t, 4>{ using scalar_type = int16_t; using type = int16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<uint16_t, 8>{ using scalar_type = uint16_t; using type = uint16x8_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<uint16_t, 16>{ using scalar_type = uint16_t; using type = uint16x8x2_t; };
+template <> struct neon_vector<int16_t, 8>{ using scalar_type = int16_t; using type = int16x8_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<int16_t, 16>{ using scalar_type = int16_t; using type = int16x8x2_t; };
+template <> struct neon_vector<uint32_t, 2>{ using scalar_type = uint32_t; using type = uint32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<int32_t, 2>{ using scalar_type = int32_t; using type = int32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<uint32_t, 4>{ using scalar_type = uint32_t; using type = uint32x4_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<int32_t, 4>{ using scalar_type = int32_t; using type = int32x4_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<uint64_t, 1>{ using scalar_type = uint64_t; using type = uint64x1_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<int64_t, 1>{ using scalar_type = int64_t; using type = int64x1_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<uint64_t, 2>{ using scalar_type = uint64_t; using type = uint64x2_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<int64_t, 2>{ using scalar_type = int64_t; using type = int64x2_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<float_t, 2>{ using scalar_type = float_t; using type = float32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<float_t, 4>{ using scalar_type = float_t; using type = float32x4_t; using tag_type = vector_128_tag; };
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <> struct neon_vector<float16_t, 4>{ using scalar_type = float16_t; using type = float16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<float16_t, 8>{ using scalar_type = float16_t; using type = float16x8_t; using tag_type = vector_128_tag; };
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif /* DOXYGEN_SKIP_THIS */
+
+/**  Helper type template to get the type of a neon vector */
+template <typename T, int S> using neon_vector_t = typename neon_vector<T, S>::type;
+/**  Helper type template to get the tag type of a neon vector */
+template <typename T, int S> using neon_vector_tag_t = typename neon_vector<T, S>::tag_type;
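+
+// Example (assumed usage): neon_vector_t<float_t, 4> is float32x4_t, and a
+// neon_vector_tag_t<float_t, 4> value (vector_128_tag{}) can tag-dispatch overloads.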
+
+/** Vector bit-width enum class */
+enum class BitWidth
+{
+    W64,  /**< 64-bit width */
+    W128, /**< 128-bit width */
+};
+
+/** Create the appropriate NEON vector given its type and size in terms of bits */
+template <typename T, BitWidth BW> struct neon_bitvector;
+// Specializations
+#ifndef DOXYGEN_SKIP_THIS
+template <> struct neon_bitvector<uint8_t, BitWidth::W64>{ using type = uint8x8_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<int8_t, BitWidth::W64>{ using type = int8x8_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<uint8_t, BitWidth::W128>{ using type = uint8x16_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<int8_t, BitWidth::W128>{ using type = int8x16_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<uint16_t, BitWidth::W64>{ using type = uint16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<int16_t, BitWidth::W64>{ using type = int16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<uint16_t, BitWidth::W128>{ using type = uint16x8_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<int16_t, BitWidth::W128>{ using type = int16x8_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<uint32_t, BitWidth::W64>{ using type = uint32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<int32_t, BitWidth::W64>{ using type = int32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<uint32_t, BitWidth::W128>{ using type = uint32x4_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<int32_t, BitWidth::W128>{ using type = int32x4_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<uint64_t, BitWidth::W64>{ using type = uint64x1_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<int64_t, BitWidth::W64>{ using type = int64x1_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<uint64_t, BitWidth::W128>{ using type = uint64x2_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<int64_t, BitWidth::W128>{ using type = int64x2_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<float_t, BitWidth::W64>{ using type = float32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<float_t, BitWidth::W128>{ using type = float32x4_t; using tag_type = vector_128_tag; };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <> struct neon_bitvector<float16_t, BitWidth::W64>{ using type = float16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<float16_t, BitWidth::W128>{ using type = float16x8_t; using tag_type = vector_128_tag; };
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif /* DOXYGEN_SKIP_THIS */
+
+/**  Helper type template to get the type of a neon vector */
+template <typename T, BitWidth BW> using neon_bitvector_t = typename neon_bitvector<T, BW>::type;
+/**  Helper type template to get the tag type of a neon vector */
+template <typename T, BitWidth BW> using neon_bitvector_tag_t = typename neon_bitvector<T, BW>::tag_type;
+
+/** Promote a type */
+template <typename T> struct promote { };
+template <> struct promote<uint8_t> { using type = uint16_t; };
+template <> struct promote<int8_t> { using type = int16_t; };
+template <> struct promote<uint16_t> { using type = uint32_t; };
+template <> struct promote<int16_t> { using type = int32_t; };
+template <> struct promote<uint32_t> { using type = uint64_t; };
+template <> struct promote<int32_t> { using type = int64_t; };
+template <> struct promote<float> { using type = float; };
+template <> struct promote<half> { using type = half; };
+
+/** Get promoted type */
+template <typename T>
+using promote_t = typename promote<T>::type;
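+
+// Example (assumed usage): promote_t<uint8_t> is uint16_t, wide enough to hold
+// the result of widening operations such as vaddl/vmull on 8-bit inputs.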
+
+// clang-format on
+// *INDENT-ON*
+} // namespace traits
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_TRAITS_H */
diff --git a/src/core/NEON/wrapper/wrapper.h b/src/core/NEON/wrapper/wrapper.h
new file mode 100644
index 0000000..e5467e9
--- /dev/null
+++ b/src/core/NEON/wrapper/wrapper.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_H
+#define ARM_COMPUTE_WRAPPER_H
+
+// Traits
+#include "src/core/NEON/wrapper/traits.h"
+
+// Intrinsics Overloads
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/core/NEON/wrapper/scalar/scalar.h"
+
+#endif /* ARM_COMPUTE_WRAPPER_H */