COMPMID-344 Updated doxygen

Change-Id: I32f7b84daa560e460b77216add529c8fa8b327ae
diff --git a/arm_compute/core/NEON/INEKernel.h b/arm_compute/core/NEON/INEKernel.h
new file mode 100644
index 0000000..3ac8164
--- /dev/null
+++ b/arm_compute/core/NEON/INEKernel.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_INEKERNEL_H__
+#define __ARM_COMPUTE_INEKERNEL_H__
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+
+namespace arm_compute
+{
+using INEKernel = ICPPKernel;
+}
+#endif /* __ARM_COMPUTE_INEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/INESimpleKernel.h b/arm_compute/core/NEON/INESimpleKernel.h
new file mode 100644
index 0000000..ca25532
--- /dev/null
+++ b/arm_compute/core/NEON/INESimpleKernel.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_INESIMPLEKERNEL_H__
+#define __ARM_COMPUTE_INESIMPLEKERNEL_H__
+
+#include "arm_compute/core/CPP/ICPPSimpleKernel.h"
+
+namespace arm_compute
+{
+using INESimpleKernel = ICPPSimpleKernel;
+}
+#endif /* __ARM_COMPUTE_INESIMPLEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/arm_compute/core/NEON/NEColorConvertHelper.inl
new file mode 100644
index 0000000..9be7c8a
--- /dev/null
+++ b/arm_compute/core/NEON/NEColorConvertHelper.inl
@@ -0,0 +1,888 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/Utils.h"
+
+#include <arm_neon.h>
+
+namespace
+{
+constexpr float red_coef_bt709    = 1.5748f;
+constexpr float green_coef_bt709  = -0.1873f;
+constexpr float green_coef2_bt709 = -0.4681f;
+constexpr float blue_coef_bt709   = 1.8556f;
+
+constexpr float rgb2yuv_bt709_kr = 0.2126f;
+constexpr float rgb2yuv_bt709_kb = 0.0722f;
+// K_g = 1 - K_r - K_b
+constexpr float rgb2yuv_bt709_kg = 0.7152f;
+// C_u = 1 / (2 * (1 - K_b))
+constexpr float rgb2yuv_bt709_cu = 0.5389f;
+// C_v = 1 / (2 * (1 - K_r))
+constexpr float rgb2yuv_bt709_cv = 0.6350f;
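+// Note that these are the reciprocals of the YUV -> RGB coefficients above:
+// blue_coef_bt709 = 2 * (1 - K_b) = 1.8556 with C_u = 1 / 1.8556 = 0.5389, and
+// red_coef_bt709  = 2 * (1 - K_r) = 1.5748 with C_v = 1 / 1.5748 = 0.6350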
+
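+// Widen 16 uint8 values into four float32x4 vectors: u8 -> u16 -> u32 -> f32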
+inline void convert_uint8x16_to_float32x4x4(const uint8x16_t &in, float32x4x4_t &out)
+{
+    const auto tmp1 = vmovl_u8(vget_low_u8(in));
+    out.val[0]      = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
+    out.val[1]      = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
+    const auto tmp2 = vmovl_u8(vget_high_u8(in));
+    out.val[2]      = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp2)));
+    out.val[3]      = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp2)));
+}
+
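+// Saturating narrow of two float32x4x3_t (4 RGB texels each) into a single uint8x8x3_t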
+inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
+{
+    out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
+    out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
+    out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
+}
+
+inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
+{
+    const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
+                                  vqmovn_u32(vcvtq_u32_f32(in.val[1])));
+    const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
+                                   vqmovn_u32(vcvtq_u32_f32(in.val[3])));
+    out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
+}
+
+inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
+                                   float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
+{
+    /*
+    Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
+    U'=-0.1146*R' - 0.3854*G' + 0.5000*B'
+    V'= 0.5000*R' - 0.4542*G' - 0.0458*B'
+    */
+    const auto c128 = vdupq_n_f32(128.f);
+
+    // Y = R * K_r + G * (1 - K_r - K_b) + B * K_b
+    yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr);
+    yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg);
+    yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb);
+
+    // U = (B - Y) / (2 * (1 - K_b)) + 128
+    uvec = vsubq_f32(bvec, yvec);
+    uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu);
+
+    // V = (R - Y) / (2 * (1 - K_r)) + 128
+    vvec = vsubq_f32(rvec, yvec);
+    vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
+}
+
+inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
+                                    float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
+{
+    float32x4x3_t rgb1, rgb2;
+
+    // Compute: cb - 128 and cr - 128;
+    const auto c128 = vdupq_n_f32(128.f);
+    uvec_val        = vsubq_f32(uvec_val, c128);
+    vvec_val        = vsubq_f32(vvec_val, c128);
+
+    // Compute:
+    // r = 0.0000f*f_u + 1.5748f*f_v;
+    // g = -0.1873f*f_u - 0.4681f*f_v;
+    // b = 1.8556f*f_u + 0.0000f*f_v;
+    const auto red   = vmulq_n_f32(vvec_val, red_coef_bt709);
+    const auto blue  = vmulq_n_f32(uvec_val, blue_coef_bt709);
+    const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
+                                 vmulq_n_f32(vvec_val, green_coef2_bt709));
+
+    // Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
+    // The results are stored in two float32x4x3_t which are then converted to one uint8x8x3_t
+    // and written back to memory using vst3/vst4 lane stores.
+
+    rgb1.val[0] = vaddq_f32(yvec_val, red);
+    rgb1.val[1] = vaddq_f32(yvec_val, green);
+    rgb1.val[2] = vaddq_f32(yvec_val, blue);
+
+    rgb2.val[0] = vaddq_f32(yyvec_val, red);
+    rgb2.val[1] = vaddq_f32(yyvec_val, green);
+    rgb2.val[2] = vaddq_f32(yyvec_val, blue);
+
+    uint8x8x3_t u8_rgb;
+    convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
+
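+    // Lanes 0-3 of u8_rgb hold the texels computed from yvec_val and lanes 4-7 those
+    // from yyvec_val, so the lane stores below interleave them (0,4,1,5,...) to
+    // restore the original pixel order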
+    if(!alpha)
+    {
+        vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
+        vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
+        vst3_lane_u8(&output_ptr[6], u8_rgb, 1);
+        vst3_lane_u8(&output_ptr[9], u8_rgb, 5);
+        vst3_lane_u8(&output_ptr[12], u8_rgb, 2);
+        vst3_lane_u8(&output_ptr[15], u8_rgb, 6);
+        vst3_lane_u8(&output_ptr[18], u8_rgb, 3);
+        vst3_lane_u8(&output_ptr[21], u8_rgb, 7);
+    }
+    else
+    {
+        uint8x8x4_t u8_rgba;
+        u8_rgba.val[0] = u8_rgb.val[0];
+        u8_rgba.val[1] = u8_rgb.val[1];
+        u8_rgba.val[2] = u8_rgb.val[2];
+        u8_rgba.val[3] = vdup_n_u8(255);
+        vst4_lane_u8(&output_ptr[0], u8_rgba, 0);
+        vst4_lane_u8(&output_ptr[4], u8_rgba, 4);
+        vst4_lane_u8(&output_ptr[8], u8_rgba, 1);
+        vst4_lane_u8(&output_ptr[12], u8_rgba, 5);
+        vst4_lane_u8(&output_ptr[16], u8_rgba, 2);
+        vst4_lane_u8(&output_ptr[20], u8_rgba, 6);
+        vst4_lane_u8(&output_ptr[24], u8_rgba, 3);
+        vst4_lane_u8(&output_ptr[28], u8_rgba, 7);
+    }
+}
+
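+// Load 16 texels of RGB data, dropping the alpha channel when the source is RGBA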
+inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
+{
+    uint8x16x3_t rgb;
+
+    if(alpha)
+    {
+        const auto tmp = vld4q_u8(ptr);
+        rgb.val[0]     = tmp.val[0];
+        rgb.val[1]     = tmp.val[1];
+        rgb.val[2]     = tmp.val[2];
+    }
+    else
+    {
+        rgb = vld3q_u8(ptr);
+    }
+
+    return rgb;
+}
+
+inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom)
+{
+    // Convert the uint8x16_t to float32x4x4_t
+    float32x4x4_t frvec_top, fgvec_top, fbvec_top;
+    convert_uint8x16_to_float32x4x4(vec_top.val[0], frvec_top);
+    convert_uint8x16_to_float32x4x4(vec_top.val[1], fgvec_top);
+    convert_uint8x16_to_float32x4x4(vec_top.val[2], fbvec_top);
+
+    float32x4x4_t frvec_bottom, fgvec_bottom, fbvec_bottom;
+    convert_uint8x16_to_float32x4x4(vec_bottom.val[0], frvec_bottom);
+    convert_uint8x16_to_float32x4x4(vec_bottom.val[1], fgvec_bottom);
+    convert_uint8x16_to_float32x4x4(vec_bottom.val[2], fbvec_bottom);
+
+    float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
+    float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
+
+    for(auto i = 0; i < 4; ++i)
+    {
+        rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
+                               fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
+        rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
+                               fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
+    }
+
+    convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]);
+    convert_float32x4x4_to_uint8x16(fuvec_top, vec_top.val[1]);
+    convert_float32x4x4_to_uint8x16(fvvec_top, vec_top.val[2]);
+    convert_float32x4x4_to_uint8x16(fyvec_bottom, vec_bottom.val[0]);
+    convert_float32x4x4_to_uint8x16(fuvec_bottom, vec_bottom.val[1]);
+    convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]);
+}
+
+inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
+                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
+                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+                              unsigned char *const __restrict out_uv)
+{
+    uint8x16x3_t vec_top, vec_bottom;
+    vec_top.val[0]    = rvec_top;
+    vec_top.val[1]    = gvec_top;
+    vec_top.val[2]    = bvec_top;
+    vec_bottom.val[0] = rvec_bottom;
+    vec_bottom.val[1] = gvec_bottom;
+    vec_bottom.val[2] = bvec_bottom;
+
+    rgb_to_yuv_conversion(vec_top, vec_bottom);
+
+    vst1q_u8(out_y_top, vec_top.val[0]);
+    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
+
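+    // 4:2:0 chroma subsampling: vuzp separates even/odd columns, vrhadd averages
+    // horizontally adjacent U/V pairs within each row and vhadd then averages the
+    // two rows, so each 2x2 block of chroma samples is reduced to one value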
+    const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]);
+    const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]);
+    const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]);
+    const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]);
+
+    uint8x8x2_t uvvec;
+    uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp));
+    uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp));
+
+    vst2_u8(out_uv, uvvec);
+}
+
+inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
+                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
+                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+                              unsigned char *const __restrict out_u,
+                              unsigned char *const __restrict out_v)
+{
+    uint8x16x3_t vec_top, vec_bottom;
+    vec_top.val[0]    = rvec_top;
+    vec_top.val[1]    = gvec_top;
+    vec_top.val[2]    = bvec_top;
+    vec_bottom.val[0] = rvec_bottom;
+    vec_bottom.val[1] = gvec_bottom;
+    vec_bottom.val[2] = bvec_bottom;
+
+    rgb_to_yuv_conversion(vec_top, vec_bottom);
+
+    vst1q_u8(out_y_top, vec_top.val[0]);
+    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
+
+    const auto uvvec_top    = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
+    const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
+    const auto uvvec        = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
+                                        vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
+
+    vst1_u8(out_u, vget_low_u8(uvvec));
+    vst1_u8(out_v, vget_high_u8(uvvec));
+}
+
+inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
+                              unsigned char *const __restrict out_y,
+                              unsigned char *const __restrict out_u,
+                              unsigned char *const __restrict out_v)
+{
+    // Convert the uint8x16_t to float32x4x4_t
+    float32x4x4_t frvec, fgvec, fbvec;
+    convert_uint8x16_to_float32x4x4(rvec, frvec);
+    convert_uint8x16_to_float32x4x4(gvec, fgvec);
+    convert_uint8x16_to_float32x4x4(bvec, fbvec);
+
+    float32x4x4_t fyvec, fuvec, fvvec;
+    for(auto i = 0; i < 4; ++i)
+    {
+        rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
+                               fyvec.val[i], fuvec.val[i], fvvec.val[i]);
+    }
+
+    uint8x16_t yvec, uvec, vvec;
+    convert_float32x4x4_to_uint8x16(fyvec, yvec);
+    convert_float32x4x4_to_uint8x16(fuvec, uvec);
+    convert_float32x4x4_to_uint8x16(fvvec, vvec);
+
+    vst1q_u8(out_y, yvec);
+    vst1q_u8(out_u, uvec);
+    vst1q_u8(out_v, vvec);
+}
+}
+
+namespace arm_compute
+{
+void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto   ta1 = vld3q_u8(in.ptr());
+        uint8x16x4_t ta2;
+        ta2.val[0] = ta1.val[0];
+        ta2.val[1] = ta1.val[1];
+        ta2.val[2] = ta1.val[2];
+        ta2.val[3] = vdupq_n_u8(255);
+        vst4q_u8(out.ptr(), ta2);
+    },
+    in, out);
+}
+
+void colorconvert_rgbx_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto   ta1 = vld4q_u8(in.ptr());
+        uint8x16x3_t ta2;
+        ta2.val[0] = ta1.val[0];
+        ta2.val[1] = ta1.val[1];
+        ta2.val[2] = ta1.val[2];
+        vst3q_u8(out.ptr(), ta2);
+    },
+    in, out);
+}
+
+template <bool yuyv, bool alpha>
+void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    constexpr auto element_size = alpha ? 32 : 24;
+    constexpr auto shift        = yuyv ? 0 : 1;
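+    // In UYVY (yuyv == false) the luma and chroma lanes are swapped with respect to
+    // YUYV, which the shift applied to the de-interleaved lane indices below accounts for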
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        float32x4x4_t uvec, yvec, vvec, yyvec;
+        const auto    ta = vld4q_u8(in.ptr());
+        //ta.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta.val[1] = U0 U2 U4 U6 ...
+        //ta.val[2] = Y1 Y3 Y5 Y7 ...
+        //ta.val[3] = V0 V2 V4 V6 ...
+
+        // Convert the uint8x16x4_t to float32x4x4_t
+        convert_uint8x16_to_float32x4x4(ta.val[0 + shift], yvec);
+        convert_uint8x16_to_float32x4x4(ta.val[1 - shift], uvec);
+        convert_uint8x16_to_float32x4x4(ta.val[2 + shift], yyvec);
+        convert_uint8x16_to_float32x4x4(ta.val[3 - shift], vvec);
+
+        yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+    },
+    in, out);
+}
+
+template <bool uv, bool alpha>
+void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    constexpr auto element_size = alpha ? 32 : 24;
+    const auto     out_stride   = output_ptr->info()->strides_in_bytes().y();
+    constexpr auto shift        = uv ? 0 : 1;
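+    // When uv == false the chroma plane interleaves V first (NV21), which the shift accounts for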
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_uv(input_ptr->plane(1), win_uv);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_uv       = vld2q_u8(in_uv.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        // Convert the uint8x16_t vectors to float32x4x4_t
+        float32x4x4_t yvec_top, yyvec_top, yvec_bottom, yyvec_bottom, uvec, vvec;
+        convert_uint8x16_to_float32x4x4(ta_y_top.val[0], yvec_top);
+        convert_uint8x16_to_float32x4x4(ta_y_top.val[1], yyvec_top);
+        convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0], yvec_bottom);
+        convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1], yyvec_bottom);
+        convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift], uvec);
+        convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift], vvec);
+
+        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+
+        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
+    },
+    in_y, in_uv, out);
+}
+
+template <bool alpha>
+void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    constexpr auto element_size = alpha ? 32 : 24;
+    const auto     out_stride   = output_ptr->info()->strides_in_bytes().y();
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_u(input_ptr->plane(1), win_uv);
+    Iterator in_v(input_ptr->plane(2), win_uv);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_u        = vld1q_u8(in_u.ptr());
+        const auto ta_v        = vld1q_u8(in_v.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_u.val[0] = U0 U2 U4 U6 ...
+        //ta_v.val[0] = V0 V2 V4 V6 ...
+
+        // Convert the uint8x16_t vectors to float32x4x4_t
+        float32x4x4_t yvec_top, yyvec_top, yvec_bottom, yyvec_bottom, uvec, vvec;
+        convert_uint8x16_to_float32x4x4(ta_y_top.val[0], yvec_top);
+        convert_uint8x16_to_float32x4x4(ta_y_top.val[1], yyvec_top);
+        convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0], yvec_bottom);
+        convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1], yyvec_bottom);
+        convert_uint8x16_to_float32x4x4(ta_u, uvec);
+        convert_uint8x16_to_float32x4x4(ta_v, vvec);
+
+        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+
+        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
+    },
+    in_y, in_u, in_v, out);
+}
+
+template <bool yuyv>
+void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = yuyv ? 0 : 1;
+
+    // NV12's UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_uv(output_ptr->plane(1), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_top    = vld4q_u8(in.ptr());
+        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+        //ta.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta.val[1] = U0 U2 U4 U6 ...
+        //ta.val[2] = Y1 Y3 Y5 Y7 ...
+        //ta.val[3] = V0 V2 V4 V6 ...
+
+        uint8x16x2_t yvec;
+        yvec.val[0] = ta_top.val[0 + shift];
+        yvec.val[1] = ta_top.val[2 + shift];
+        vst2q_u8(out_y.ptr(), yvec);
+
+        uint8x16x2_t yyvec;
+        yyvec.val[0] = ta_bottom.val[0 + shift];
+        yyvec.val[1] = ta_bottom.val[2 + shift];
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+        uint8x16x2_t uvvec;
+        uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+        uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+        vst2q_u8(out_uv.ptr(), uvvec);
+    },
+    in, out_y, out_uv);
+}
+
+void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_u(input_ptr->plane(1), win_uv);
+    Iterator in_v(input_ptr->plane(2), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_uv(output_ptr->plane(1), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto   ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto   ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        uint8x16x2_t ta_uv;
+        ta_uv.val[0] = vld1q_u8(in_u.ptr());
+        ta_uv.val[1] = vld1q_u8(in_v.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+        vst2q_u8(out_uv.ptr(), ta_uv);
+    },
+    in_y, in_u, in_v, out_y, out_uv);
+}
+
+template <bool uv>
+void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = uv ? 0 : 1;
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_uv(input_ptr->plane(1), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win_uv);
+    Iterator out_v(output_ptr->plane(2), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_uv       = vld2q_u8(in_uv.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+        vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
+        vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
+    },
+    in_y, in_uv, out_y, out_u, out_v);
+}
+
+template <bool yuyv>
+void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = yuyv ? 0 : 1;
+
+    // Destination's UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win_uv);
+    Iterator out_v(output_ptr->plane(2), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_top    = vld4q_u8(in.ptr());
+        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+        //ta.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta.val[1] = U0 U2 U4 U6 ...
+        //ta.val[2] = Y1 Y3 Y5 Y7 ...
+        //ta.val[3] = V0 V2 V4 V6 ...
+
+        uint8x16x2_t yvec;
+        yvec.val[0] = ta_top.val[0 + shift];
+        yvec.val[1] = ta_top.val[2 + shift];
+        vst2q_u8(out_y.ptr(), yvec);
+
+        uint8x16x2_t yyvec;
+        yyvec.val[0] = ta_bottom.val[0 + shift];
+        yyvec.val[1] = ta_bottom.val[2 + shift];
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+        uint8x16_t uvec;
+        uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+        vst1q_u8(out_u.ptr(), uvec);
+
+        uint8x16_t vvec;
+        vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+        vst1q_u8(out_v.ptr(), vvec);
+    },
+    in, out_y, out_u, out_v);
+}
+
+template <bool uv>
+void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = uv ? 0 : 1;
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_uv(input_ptr->plane(1), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win);
+    Iterator out_v(output_ptr->plane(2), win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_uv       = vld2q_u8(in_uv.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
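+        // Upsample the chroma to full resolution: writing each U/V value to both lanes
+        // of a vst2q duplicates it horizontally, and writing two rows duplicates it vertically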
+        uint8x16x2_t uvec;
+        uvec.val[0] = ta_uv.val[0 + shift];
+        uvec.val[1] = ta_uv.val[0 + shift];
+        vst2q_u8(out_u.ptr(), uvec);
+        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+        uint8x16x2_t vvec;
+        vvec.val[0] = ta_uv.val[1 - shift];
+        vvec.val[1] = ta_uv.val[1 - shift];
+        vst2q_u8(out_v.ptr(), vvec);
+        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+    },
+    in_y, in_uv, out_y, out_u, out_v);
+}
+
+void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_u(input_ptr->plane(1), win_uv);
+    Iterator in_v(input_ptr->plane(2), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win);
+    Iterator out_v(output_ptr->plane(2), win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_u        = vld1q_u8(in_u.ptr());
+        const auto ta_v        = vld1q_u8(in_v.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_u = U0 U2 U4 U6 ...
+        //ta_v = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+        uint8x16x2_t uvec;
+        uvec.val[0] = ta_u;
+        uvec.val[1] = ta_u;
+        vst2q_u8(out_u.ptr(), uvec);
+        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+        uint8x16x2_t vvec;
+        vvec.val[0] = ta_v;
+        vvec.val[1] = ta_v;
+        vst2q_u8(out_v.ptr(), vvec);
+        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+    },
+    in_y, in_u, in_v, out_y, out_u, out_v);
+}
+
+template <bool alpha>
+void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_uv(output_ptr->plane(1), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
+        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+        //ta_rgb.val[0] = R0 R1 R2 R3 ...
+        //ta_rgb.val[1] = G0 G1 G2 G3 ...
+        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+        store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
+                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
+                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
+                          out_uv.ptr());
+    },
+    in, out_y, out_uv);
+}
+
+template <bool alpha>
+void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win_uv);
+    Iterator out_v(output_ptr->plane(2), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
+        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+        //ta_rgb.val[0] = R0 R1 R2 R3 ...
+        //ta_rgb.val[1] = G0 G1 G2 G3 ...
+        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+        store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
+                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
+                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
+                          out_u.ptr(), out_v.ptr());
+    },
+    in, out_y, out_u, out_v);
+}
+
+template <bool alpha>
+void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win);
+    Iterator out_v(output_ptr->plane(2), win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_rgb = load_rgb(in.ptr(), alpha);
+        //ta_rgb.val[0] = R0 R1 R2 R3 ...
+        //ta_rgb.val[1] = G0 G1 G2 G3 ...
+        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+        store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
+                          out_y.ptr(), out_u.ptr(), out_v.ptr());
+    },
+    in, out_y, out_u, out_v);
+}
+}
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
new file mode 100644
index 0000000..fb71261
--- /dev/null
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -0,0 +1,686 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFIXEDPOINT_H__
+#define __ARM_COMPUTE_NEFIXEDPOINT_H__
+
+#include "arm_compute/core/FixedPoint.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+using qint8x8_t    = int8x8_t;    /**< 8 bit fixed point vector with 8 elements */
+using qint8x8x2_t  = int8x8x2_t;  /**< 8 bit fixed point vector with 16 elements */
+using qint8x8x3_t  = int8x8x3_t;  /**< 8 bit fixed point vector with 24 elements */
+using qint8x8x4_t  = int8x8x4_t;  /**< 8 bit fixed point vector with 32 elements */
+using qint8x16_t   = int8x16_t;   /**< 8 bit fixed point vector with 16 elements */
+using qint8x16x2_t = int8x16x2_t; /**< 8 bit fixed point vector with 32 elements */
+using qint8x16x3_t = int8x16x3_t; /**< 8 bit fixed point vector with 48 elements */
+using qint8x16x4_t = int8x16x4_t; /**< 8 bit fixed point vector with 64 elements */
+using qint16x4_t   = int16x4_t;   /**< 16 bit fixed point vector with 4 elements */
+using qint16x4x2_t = int16x4x2_t; /**< 16 bit fixed point vector with 8 elements */
+using qint16x4x3_t = int16x4x3_t; /**< 16 bit fixed point vector with 12 elements */
+using qint16x4x4_t = int16x4x4_t; /**< 16 bit fixed point vector with 16 elements */
+using qint16x8_t   = int16x8_t;   /**< 16 bit fixed point vector with 8 elements */
+using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */
+using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */
+using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
+
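+// A value with fixed point position p stores real_value * 2^p in the underlying
+// integer, e.g. with fixed_point_position = 3 the real value 1.5f is represented
+// by the qint8_t integer 12 (1.5 * 2^3).
+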
+/** Get the lower half of a 16 elements vector
+ *
+ * @param[in] a vector of 16 elements
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vget_low_qs8(qint8x16_t a);
+
+/** Get the higher half of a 16 elements vector
+ *
+ * @param[in] a vector of 16 elements
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vget_high_qs8(qint8x16_t a);
+
+/** Load a single 8 bit fixed point vector from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point vector to load
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vld1_qs8(const qint8_t *addr);
+
+/** Load a single 8 bit fixed point vector from memory (16 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point vector to load
+ *
+ * @return 8 bit fixed point vector (16 elements)
+ */
+qint8x16_t vld1q_qs8(const qint8_t *addr);
+
+/** Load a single 16 bit fixed point vector from memory (4 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point vector to load
+ *
+ * @return 16 bit fixed point vector (4 elements)
+ */
+qint16x4_t vld1_qs16(const qint16_t *addr);
+
+/** Load a single 16 bit fixed point vector from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point vector to load
+ *
+ * @return 16 bit fixed point vector (8 elements)
+ */
+qint16x8_t vld1q_qs16(const qint16_t *addr);
+
+/** Load all lanes of an 8 bit fixed point vector with the same value from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vld1_dup_qs8(const qint8_t *addr);
+
+/** Load all lanes of an 8 bit fixed point vector with the same value from memory (16 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
+ *
+ * @return 8 bit fixed point vector (16 elements)
+ */
+qint8x16_t vld1q_dup_qs8(const qint8_t *addr);
+
+/** Store a single 8 bit fixed point vector to memory (8 elements)
+ *
+ * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
+ * @param[in] b    8 bit fixed point vector to store
+ *
+ */
+void vst1_qs8(qint8_t *addr, qint8x8_t b);
+
+/** Store a single 8 bit fixed point vector to memory (16 elements)
+ *
+ * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
+ * @param[in] b    8 bit fixed point vector to store
+ *
+ */
+void vst1q_qs8(qint8_t *addr, qint8x16_t b);
+
+/** Store a single 16 bit fixed point vector to memory (4 elements)
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+ * @param[in] b    16 bit fixed point vector to store
+ *
+ */
+void vst1_qs16(qint16_t *addr, qint16x4_t b);
+
+/** Store a single 16 bit fixed point vector to memory (8 elements)
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+ * @param[in] b    16 bit fixed point vector to store
+ *
+ */
+void vst1q_qs16(qint16_t *addr, qint16x8_t b);
+
+/** 16 bit fixed point vector saturating narrow (8 elements)
+ *
+ * @param[in] a 16 bit fixed point vector to convert
+ *
+ * @return 8 bit fixed point vector
+ */
+qint8x8_t vqmovn_qs16(qint16x8_t a);
+
+/** 8 bit fixed point vector duplicate (8 elements)
+ *
+ * @param[in] a 8 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint8x8_t vdup_n_qs8(qint8_t a);
+
+/** 8 bit fixed point vector duplicate (16 elements)
+ *
+ * @param[in] a 8 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint8x16_t vdupq_n_qs8(qint8_t a);
+
+/** Duplicate a float and convert it to 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a                    Float value to duplicate and convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the vector duplication
+ */
+qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position);
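+// For example, given the representation above, vdupq_n_qs8_f32(0.5f, 6) fills
+// all 16 lanes with the integer 32 (0.5 * 2^6).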
+
+/** 16 bit fixed point vector duplicate (8 elements)
+ *
+ * @param[in] a 16 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint16x8_t vdupq_n_qs16(qint16_t a);
+
+/** Absolute value of 8 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x8_t vabs_qs8(qint8x8_t a);
+
+/** Absolute value of 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x16_t vabsq_qs8(qint8x16_t a);
+
+/** Saturating absolute value of 8 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x8_t vqabs_qs8(qint8x8_t a);
+
+/** Saturating absolute value of 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x16_t vqabsq_qs8(qint8x16_t a);
+
+/** 8 bit fixed point vector max (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector max operation
+ */
+qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector max (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector max operation
+ */
+qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector pairwise max (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector pairwise max operation
+ */
+qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector min (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector min operation
+ */
+qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector min (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector min operation
+ */
+qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector pairwise min (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector pairwise min operation
+ */
+qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector add (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition
+ */
+qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector add (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition
+ */
+qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector saturating add (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector saturating add (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 16 bit fixed point vector saturating add (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b);
+
+/** 16 bit fixed point vector saturating add (8 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b);
+
+/** 8 bit fixed point vector pairwise add long (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the pairwise addition, widened to a 16 bit fixed point vector (4 elements)
+ */
+qint16x4_t vpaddl_qs8(qint8x8_t a);
+
+/** 8 bit fixed point vector subtraction (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction
+ */
+qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector subtraction (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction
+ */
+qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector saturating subtraction (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
+ */
+qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector saturating subtraction (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
+ */
+qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector multiply (8 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication.
+ */
+qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply (16 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication.
+ */
+qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply (8 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
+ */
+qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply (16 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
+ */
+qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector long multiply (8 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point long vector multiplication.
+ */
+qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate
+ */
+qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate
+ */
+qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
+ */
+qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
+ */
+qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply-accumulate long (8 elements).
+ *  This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
+ *
+ * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate long
+ */
+qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate long (8 elements). The saturation is performed on the 16 bit fixed point output vector.
+ *  This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
+ *
+ * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate long
+ */
+qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** Convert a float vector with 4x2 elements to an 8 bit fixed point vector with 8 elements
+ *
+ * @param[in] a                    Float input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 8 bit fixed point
+ */
+qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position);
+
+/** Convert a float vector with 4x4 elements to an 8 bit fixed point vector with 16 elements
+ *
+ * @param[in] a                    Float input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 8 bit fixed point
+ */
+qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position);
+
+/** Convert an 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements
+ *
+ * @param[in] a                    8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 8 bit fixed point -> float32x4x2
+ */
+float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Convert an 8 bit fixed point vector with 16 elements to a float vector with 4x4 elements
+ *
+ * @param[in] a                    8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 8 bit fixed point -> float32x4x4
+ */
+float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position);
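Round-tripping through these converters quantises each lane to the nearest representable step, so the error is bounded by half a step, 2^-(p+1). A scalar model of one lane, with to_qs8/from_qs8 as hypothetical stand-ins rather than library functions:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int8_t to_qs8(float f, int fixed_point_position)
    {
        const float scaled = std::round(f * (1 << fixed_point_position));
        return static_cast<int8_t>(std::max(-128.0f, std::min(127.0f, scaled)));
    }

    float from_qs8(int8_t q, int fixed_point_position)
    {
        return q / static_cast<float>(1 << fixed_point_position);
    }

    int main()
    {
        const int8_t q = to_qs8(0.30f, 6); // Q1.6: 0.30 -> 19
        printf("%f\n", from_qs8(q, 6));    // 0.296875, within 2^-7 of the input
        return 0;
    }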
+
+/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit reciprocal (1/a).
+ */
+qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit reciprocal (1/a).
+ */
+qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** 8 bit fixed point vector division (8 elements)
+ *
+ * @param[in] a                    First 8bit fixed point input vector
+ * @param[in] b                    Second 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The quotient in 8 bit fixed point format.
+ */
+qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector division (16 elements)
+ *
+ * @param[in] a                    First 8bit fixed point input vector
+ * @param[in] b                    Second 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The quotient in 8 bit fixed point format.
+ */
+qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** Perform a 4th degree polynomial approximation. (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit Taylor approximation.
+ */
+template <bool islog>
+qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Perform a 4th degree polynomial approximation. (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit Taylor approximation.
+ */
+template <bool islog>
+qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating exponential fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit saturating exponential
+ */
+qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate saturating exponential fixed point 8bit (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit saturating exponential
+ */
+qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate logarithm fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit logarithm.
+ */
+qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate logarithm fixed point 8bit (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit logarithm.
+ */
+qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate inverse square root for fixed point 8bit using Newton-Raphson method (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate inverse square root for fixed point 8bit using Newton-Raphson method (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate hyperbolic tangent for fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate hyperbolic tangent for fixed point 8bit (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating power for fixed point 8bit (16 elements).
+ *
+ * pow(a,b) = e^(b*log(a))
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] b                    8bit fixed point power vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit power.
+ */
+qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+}
+#include "arm_compute/core/NEON/NEFixedPoint.inl"
+#endif /* __ARM_COMPUTE_NEFIXEDPOINT_H__ */
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
new file mode 100644
index 0000000..6db344d
--- /dev/null
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -0,0 +1,1018 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+namespace arm_compute
+{
+/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
+ *  Format is in Q0.7 for all elements */
+const std::array<qint8x8_t, 4> exp_tab_qs8 =
+{
+    {
+        vdup_n_s8(0x7F), // 0.9978546
+        vdup_n_s8(0x3F), // 0.4994721
+        vdup_n_s8(0x16), // 0.1763723
+        vdup_n_s8(0x05), // 0.0435108
+    }
+};
+
+/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
+ * Format is in Q0.7 for all elements */
+const std::array<qint8x16_t, 4> exp_tabq_qs8 =
+{
+    {
+        vdupq_n_s8(0x7F), // 0.9978546
+        vdupq_n_s8(0x3F), // 0.4994721
+        vdupq_n_s8(0x16), // 0.1763723
+        vdupq_n_s8(0x05), // 0.0435108
+    }
+};
+
+/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
+ * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+const std::array<qint8x8_t, 4> log_tab_qs8 =
+{
+    {
+        vdup_n_s8(0x5C),  // 1.4384189
+        vdup_n_s8(-0x56), // -0.6771900
+        vdup_n_s8(0x29),  // 0.3218538
+        vdup_n_s8(-0x0A), // -0.0832229
+    }
+};
+
+/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
+ * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+const std::array<qint8x16_t, 4> log_tabq_qs8 =
+{
+    {
+        vdupq_n_s8(0x5C),  // 1.4384189
+        vdupq_n_s8(-0x56), // -0.6771900
+        vdupq_n_s8(0x29),  // 0.3218538
+        vdupq_n_s8(-0x0A), // -0.0832229
+    }
+};
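The hex constants appear to be the float values in the comments truncated to the stated Q format, Q0.7 (or Q1.6 for the leading logarithm coefficient). A quick scalar check of that reading (illustrative, not library code):

    #include <cstdint>
    #include <cstdio>

    // Truncate a coefficient to a signed value with frac_bits fractional bits
    int8_t to_fixed(double c, int frac_bits)
    {
        return static_cast<int8_t>(c * (1 << frac_bits)); // truncates toward zero
    }

    int main()
    {
        printf("0x%02X\n", to_fixed(0.9978546, 7)); // 0x7F
        printf("0x%02X\n", to_fixed(0.4994721, 7)); // 0x3F
        printf("0x%02X\n", to_fixed(1.4384189, 6)); // 0x5C (Q1.6)
        printf("%d\n", to_fixed(-0.6771900, 7));    // -86 == -0x56
        return 0;
    }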
+
+inline qint8x8_t vget_low_qs8(qint8x16_t a)
+{
+    return vget_low_s8(a);
+}
+
+inline qint8x8_t vget_high_qs8(qint8x16_t a)
+{
+    return vget_high_s8(a);
+}
+
+inline qint8x8_t vld1_qs8(const qint8_t *addr)
+{
+    return vld1_s8(addr);
+}
+
+inline qint8x16_t vld1q_qs8(const qint8_t *addr)
+{
+    return vld1q_s8(addr);
+}
+
+inline qint16x4_t vld1_qs16(const qint16_t *addr)
+{
+    return vld1_s16(addr);
+}
+
+inline qint16x8_t vld1q_qs16(const qint16_t *addr)
+{
+    return vld1q_s16(addr);
+}
+
+inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
+{
+    return vld1_dup_s8(addr);
+}
+
+inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
+{
+    return vld1q_dup_s8(addr);
+}
+
+inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
+{
+    vst1_s8(addr, b);
+}
+
+inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
+{
+    vst1q_s8(addr, b);
+}
+
+inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
+{
+    vst1_s16(addr, b);
+}
+
+inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
+{
+    vst1q_s16(addr, b);
+}
+
+inline qint8x8_t vqmovn_qs16(qint16x8_t a)
+{
+    return vqmovn_s16(a);
+}
+
+inline qint8x8_t vdup_n_qs8(qint8_t a)
+{
+    return vdup_n_s8(a);
+}
+
+inline qint8x16_t vdupq_n_qs8(qint8_t a)
+{
+    return vdupq_n_s8(a);
+}
+
+inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
+{
+    float32x4x4_t res =
+    {
+        {
+            vdupq_n_f32(a),
+            vdupq_n_f32(a),
+            vdupq_n_f32(a),
+            vdupq_n_f32(a),
+        }
+    };
+    return vcvtq_qs8_f32(res, fixed_point_position);
+}
+
+inline qint16x8_t vdupq_n_qs16(qint16_t a)
+{
+    return vdupq_n_s16(a);
+}
+
+inline qint8x8_t vabs_qs8(qint8x8_t a)
+{
+    return vabs_s8(a);
+}
+
+inline qint8x16_t vabsq_qs8(qint8x16_t a)
+{
+    return vabsq_s8(a);
+}
+
+inline qint8x8_t vqabs_qs8(qint8x8_t a)
+{
+    return vqabs_s8(a);
+}
+
+inline qint8x16_t vqabsq_qs8(qint8x16_t a)
+{
+    return vqabsq_s8(a);
+}
+
+inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vmax_s8(a, b);
+}
+
+inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vmaxq_s8(a, b);
+}
+
+inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vpmax_s8(a, b);
+}
+
+inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vmin_s8(a, b);
+}
+
+inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vminq_s8(a, b);
+}
+
+inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vpmin_s8(a, b);
+}
+
+inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vadd_s8(a, b);
+}
+
+inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vaddq_s8(a, b);
+}
+
+inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vqadd_s8(a, b);
+}
+
+inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vqaddq_s8(a, b);
+}
+
+inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
+{
+    return vqadd_s16(a, b);
+}
+
+inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
+{
+    return vqaddq_s16(a, b);
+}
+
+inline int16x4_t vpaddl_qs8(qint8x8_t a)
+{
+    return vpaddl_s8(a);
+}
+
+inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vsub_s8(a, b);
+}
+
+inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vsubq_s8(a, b);
+}
+
+inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vqsub_s8(a, b);
+}
+
+inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vqsubq_s8(a, b);
+}
+
+inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary result with a constant used to round up the result
+    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    res = vmlal_s8(res, a, b);
+
+    // Shift right by fixed_point_position
+    res = vshlq_s16(res, fixed_point_position_s16);
+
+    // Convert back to qint8
+    return vmovn_s16(res);
+}
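Per lane the routine computes (a * b + 2^(p-1)) >> p: widen, multiply, add the rounding constant, shift back. The one non-obvious step is the shift itself; NEON vector shifts take a signed per-lane amount and shift right when it is negative, hence the duplicated -fixed_point_position above. A scalar rendering under those assumed semantics:

    #include <cstdint>
    #include <cstdio>

    int8_t mul_qs8_lane(int8_t a, int8_t b, int fixed_point_position)
    {
        int16_t res = 1 << (fixed_point_position - 1);           // rounding constant
        res = static_cast<int16_t>(res + a * b);                 // widening multiply-accumulate
        res = static_cast<int16_t>(res >> fixed_point_position); // the negative vshlq_s16 above
        return static_cast<int8_t>(res);                         // narrow; may wrap, no saturation
    }

    int main()
    {
        printf("%d\n", mul_qs8_lane(48, 16, 5)); // 1.5 * 0.5 in Q2.5 -> 24, i.e. 0.75
        return 0;
    }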
+
+inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+    qint16x8_t res1 = res0;
+
+    // Vector multiply-accumulate long
+    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
+    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
+
+    // Shift right by fixed_point_position
+    res0 = vshlq_s16(res0, fixed_point_position_s16);
+    res1 = vshlq_s16(res1, fixed_point_position_s16);
+
+    // Convert back to qint8
+    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
+}
+
+inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary result with a constant used to round up the result
+    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    res = vmlal_s8(res, a, b);
+
+    // Shift right by fixed_point_position
+    res = vqshlq_s16(res, fixed_point_position_s16);
+
+    // Convert back to qint8 and saturate
+    return vqmovn_s16(res);
+}
+
+inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+    qint16x8_t res1 = res0;
+
+    // Vector multiply-accumulate long
+    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
+    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
+
+    // Shift right by fixed_point_position
+    res0 = vqshlq_s16(res0, fixed_point_position_s16);
+    res1 = vqshlq_s16(res1, fixed_point_position_s16);
+
+    // Convert back to qint8 and saturate
+    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
+}
+
+inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    qint16x8_t res = vmull_s8(a, b);
+
+    return vqrshlq_s16(res, fixed_point_position_s16);
+}
+
+inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s8(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vshlq_s16(tmp, fixed_point_position_s16);
+
+    // Convert back to qint8 and accumulate
+    return vadd_s8(a, vmovn_s16(tmp));
+}
+
+inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+    qint16x8_t tmp1 = tmp0;
+
+    // Vector multiply-accumulate long
+    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
+    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
+
+    // Shift right by fixed_point_position
+    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
+    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
+
+    // Convert back to qint8 and accumulate
+    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
+}
+
+inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s8(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vqshlq_s16(tmp, fixed_point_position_s16);
+
+    // Convert back to qint8 and accumulate
+    return vqadd_s8(a, vqmovn_s16(tmp));
+}
+
+inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+    qint16x8_t tmp1 = tmp0;
+
+    // Vector multiply-accumulate long
+    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
+    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
+
+    // Shift right by fixed_point_position
+    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
+    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
+
+    // Convert back to qint8 and accumulate
+    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
+    return vqaddq_s8(a, res);
+}
+
+inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s8(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vshlq_s16(tmp, fixed_point_position_s16);
+
+    // Accumulate
+    return vaddq_s16(a, tmp);
+}
+
+inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s8(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vqshlq_s16(tmp, fixed_point_position_s16);
+
+    // Accumulate
+    return vqaddq_s16(a, tmp);
+}
+
+inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
+
+    float32x4x2_t res_f32 =
+    {
+        {
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f)
+        }
+    };
+
+    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
+    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
+
+    const int32x4x2_t res_s32 =
+    {
+        {
+            vcvtq_s32_f32(res_f32.val[0]),
+            vcvtq_s32_f32(res_f32.val[1]),
+        }
+    };
+
+    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
+
+    return vqmovn_s16(res_s16);
+}
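Per lane this is: scale by 2^p, add 0.5, truncate toward zero (vcvtq_s32_f32 rounds toward zero), then narrow twice with saturation. A scalar sketch of that reading; note that the +0.5/truncate pair matches round-to-nearest only for non-negative inputs:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int8_t cvt_qs8_f32_lane(float f, int fixed_point_position)
    {
        const float   scaled = f * (1 << fixed_point_position) + 0.5f;
        const int32_t i      = static_cast<int32_t>(scaled); // truncation toward zero
        return static_cast<int8_t>(std::max(-128, std::min(127, i)));
    }

    int main()
    {
        printf("%d\n", cvt_qs8_f32_lane(0.75f, 5)); // 24
        printf("%d\n", cvt_qs8_f32_lane(10.0f, 5)); // saturates to 127
        return 0;
    }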
+
+inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
+
+    float32x4x4_t res_f32 =
+    {
+        {
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f)
+        }
+    };
+
+    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
+    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
+    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
+    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);
+
+    const int32x4x4_t res_s32 =
+    {
+        {
+            vcvtq_s32_f32(res_f32.val[0]),
+            vcvtq_s32_f32(res_f32.val[1]),
+            vcvtq_s32_f32(res_f32.val[2]),
+            vcvtq_s32_f32(res_f32.val[3]),
+        }
+    };
+
+    const int16x8x2_t res_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
+            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
+        }
+    };
+
+    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
+}
+
+inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
+
+    const int16x8_t res_s16 = vmovl_s8(a);
+
+    const int32x4x2_t res_s32 =
+    {
+        {
+            vmovl_s16(vget_low_s16(res_s16)),
+            vmovl_s16(vget_high_s16(res_s16))
+        }
+    };
+
+    float32x4x2_t res_f32 =
+    {
+        {
+            vcvtq_f32_s32(res_s32.val[0]),
+            vcvtq_f32_s32(res_s32.val[1])
+        }
+    };
+
+    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
+    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
+
+    return res_f32;
+}
+
+inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
+
+    const int16x8x2_t res_s16 =
+    {
+        {
+            vmovl_s8(vget_low_s8(a)),
+            vmovl_s8(vget_high_s8(a)),
+        }
+    };
+
+    const int32x4x4_t res_s32 =
+    {
+        {
+            vmovl_s16(vget_low_s16(res_s16.val[0])),
+            vmovl_s16(vget_high_s16(res_s16.val[0])),
+            vmovl_s16(vget_low_s16(res_s16.val[1])),
+            vmovl_s16(vget_high_s16(res_s16.val[1])),
+        }
+    };
+
+    float32x4x4_t res_f32 =
+    {
+        {
+            vcvtq_f32_s32(res_s32.val[0]),
+            vcvtq_f32_s32(res_s32.val[1]),
+            vcvtq_f32_s32(res_s32.val[2]),
+            vcvtq_f32_s32(res_s32.val[3])
+        }
+    };
+
+    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
+    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
+    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
+    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
+
+    return res_f32;
+}
+
+inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+    const qint8x8_t const_48_over_17       = vdup_n_s8(0x7A >> (5 - fixed_point_position));    // 2.823
+    const qint8x8_t const_minus_32_over_17 = vdup_n_s8(-(0x3C >> (5 - fixed_point_position))); // -1.8823
+    const qint8x8_t const_one              = vdup_n_s8(1 << fixed_point_position);
+
+    // Find shift value
+    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+    const qint8x8_t temp        = vshl_s8(a, shift_value);
+
+    qint8x8_t x = vadd_s8(const_48_over_17, vmul_qs8(temp, const_minus_32_over_17, fixed_point_position));
+
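+    // Set initial guess to one if x > 1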
+    uint8x8_t set_one = vcgt_s8(x, const_one);
+    x                 = vbsl_s8(set_one, const_one, x);
+
+    // Use three iterations of the Newton-Raphson method to get the result
+    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vshl_s8(x, shift_value);
+}
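The seed 48/17 - 32/17 * t is the classic linear starting estimate for a reciprocal on a normalised interval, and each Newton-Raphson step x <- x + x * (1 - t * x) roughly doubles the number of correct bits, so three steps comfortably exceed 8 bit precision. A float sketch of the same scheme, exposition only:

    #include <cstdio>

    int main()
    {
        const double t = 0.6;                     // normalised input in [0.5, 1)
        double x = 48.0 / 17.0 - 32.0 / 17.0 * t; // linear initial estimate
        for (int i = 0; i < 3; ++i)
        {
            x += x * (1.0 - t * x);               // Newton-Raphson step for 1/t
        }
        printf("%.9f vs %.9f\n", x, 1.0 / t);     // both print 1.666666667
        return 0;
    }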
+
+inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+    const qint8x16_t const_48_over_17       = vdupq_n_s8(0x7A >> (5 - fixed_point_position));   // 2.823
+    const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (the vsubq below applies the sign)
+    const qint8x16_t const_one              = vdupq_n_s8(1 << fixed_point_position);
+
+    // Find shift value
+    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+    const qint8x16_t temp        = vshlq_s8(a, shift_value);
+
+    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
+
+    // Set initial guess to one if x > 1
+    uint8x16_t set_one = vcgtq_s8(x, const_one);
+    x                  = vbslq_s8(set_one, const_one, x);
+
+    // Use three iterations of the Newton-Raphson method to get the result
+    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vshlq_s8(x, shift_value);
+}
+
+inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+    const qint8x16_t const_48_over_17       = vdupq_n_s8(0x7A >> (5 - fixed_point_position));   // 2.823
+    const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (the vqsubq below applies the sign)
+    const qint8x16_t const_one              = vdupq_n_s8(1 << fixed_point_position);
+
+    // Find shift value
+    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+    const qint8x16_t temp        = vqshlq_s8(a, shift_value);
+
+    qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
+
+    // Set initial guess to one if x > 1
+    uint8x16_t set_one = vcgtq_s8(x, const_one);
+    x                  = vbslq_s8(set_one, const_one, x);
+
+    // Use three iterations of the Newton-Raphson method to get the result
+    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vqshlq_s8(x, shift_value);
+}
+
+inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+    return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
+}
+
+inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+    return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
+}
+
+template <bool islog>
+inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
+    const qint8x8_t const_one   = vdup_n_s8(1);
+    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
+    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
+    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
+    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
+    const qint8x8_t x1          = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
+    const qint8x8_t x2          = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
+    const qint8x8_t x3          = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
+    const qint8x8_t res         = vmul_qs8(a, x3, fixed_point_position);
+    return res;
+}
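The x1/x2/x3 chain is Horner's rule for res = A*a + B*a^2 + C*a^3 + D*a^4, one multiply-add per coefficient. A float sketch using the exponential table; that those coefficients approximate e^a - 1 on the reduced range is inferred from vqexp_qs8 adding one to the result further down:

    #include <cmath>
    #include <cstdio>

    double taylor_poly(double a, double A, double B, double C, double D)
    {
        const double x1 = a * D + C;
        const double x2 = a * x1 + B;
        const double x3 = a * x2 + A;
        return a * x3; // == A*a + B*a^2 + C*a^3 + D*a^4
    }

    int main()
    {
        const double r = taylor_poly(0.5, 0.9978546, 0.4994721, 0.1763723, 0.0435108);
        printf("%f vs %f\n", r + 1.0, std::exp(0.5)); // 1.648561 vs 1.648721
        return 0;
    }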
+
+template <bool islog>
+inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
+    const qint8x8_t const_one   = vdup_n_s8(1);
+    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
+    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
+    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
+    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
+    const qint8x8_t x1          = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
+    const qint8x8_t x2          = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
+    const qint8x8_t x3          = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
+    const qint8x8_t res         = vqmul_qs8(a, x3, fixed_point_position);
+    return res;
+}
+
+template <bool islog>
+inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
+    const qint8x16_t const_one   = vdupq_n_s8(1);
+    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
+    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
+    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
+    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
+    const qint8x16_t x1          = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
+    const qint8x16_t x2          = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
+    const qint8x16_t x3          = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
+    const qint8x16_t res         = vmulq_qs8(a, x3, fixed_point_position);
+    return res;
+}
+
+template <bool islog>
+inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
+    const qint8x16_t const_one   = vdupq_n_s8(1);
+    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
+    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
+    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
+    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
+    const qint8x16_t x1          = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
+    const qint8x16_t x2          = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
+    const qint8x16_t x3          = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
+    const qint8x16_t res         = vqmulq_qs8(a, x3, fixed_point_position);
+    return res;
+}
+
+inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
+    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
+    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2)
+    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)
+
+    // Perform range reduction [-log(2),log(2)]
+    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
+
+    // Get the integer part of m
+    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));
+
+    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
+    alpha           = vqabs_qs8(vqsub_s8(a, alpha));
+
+    // Polynomial Approximation
+    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
+    poly           = vqadd_s8(poly, const_one);
+
+    // Reconstruct
+    poly = vqshl_s8(poly, dec_m);
+
+    return poly;
+}
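The range reduction rests on e^a = 2^n * e^r, with n the integer part of a / ln 2 and r the remainder, so the polynomial only ever sees a small argument and the final vqshl_s8 by dec_m supplies the 2^n factor. A float sketch of the identity, exposition only:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double a = 2.3;
        const int    n = static_cast<int>(a / std::log(2.0)); // integer part: 3
        const double r = a - n * std::log(2.0);               // remainder in [0, ln 2)
        printf("%f vs %f\n", std::ldexp(std::exp(r), n), std::exp(a)); // both 9.974182
        return 0;
    }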
+
+inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
+    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
+    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2)
+    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)
+
+    // Perform range reduction [-log(2),log(2)]
+    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
+
+    // Get the integer part of m
+    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));
+
+    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
+    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));
+
+    // Polynomial Approximation
+    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
+    poly            = vqaddq_s8(poly, const_one);
+
+    // Reconstruct
+    poly = vqshlq_s8(poly, dec_m);
+
+    return poly;
+}
+
+inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
+    const qint8x8_t const_seven_dec = vdup_n_s8(7);
+    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
+
+    // If 0 < a < 1, calculate log(1/a)
+    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
+    qint8x8_t recip           = vdup_n_s8(0);
+    recip                     = vbsl_s8(calc_reciprocal, a, recip);
+
+    // Calculate reciprocal
+    recip = vrecip_qs8(recip, fixed_point_position);
+    a     = vbsl_s8(calc_reciprocal, recip, a);
+
+    // Get the integer part of a
+    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
+    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position
+
+    // Get the exponent n of the greatest power of two less than or equal to dec_a
+    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
+
+    // Get x to range (1, 2]
+    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
+    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
+    const qint8x8_t sum             = vmul_s8(shift_value, const_one);
+
+    // Polynomial Approximation
+    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
+
+    // Reconstruct
+    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
+
+    // Set negative value for 0 < a < 1
+    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
+
+    return poly;
+}
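The decomposition writes a = t * 2^n with t close to one, so ln(a) = n * ln 2 + ln(t); the shift_value arithmetic finds n, the polynomial handles ln(t), and for 0 < a < 1 the code computes -ln(1/a) instead so t stays in range. A float sketch, exposition only:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double a = 6.5;
        const int    n = static_cast<int>(std::floor(std::log2(a))); // 2
        const double t = a / (1 << n);                               // 1.625
        printf("%f vs %f\n", n * std::log(2.0) + std::log(t), std::log(a)); // both 1.871802
        return 0;
    }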
+
+inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
+    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
+    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
+
+    // If 0 < a < 1, calculate log(1/a)
+    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
+    qint8x16_t recip           = vdupq_n_s8(0);
+    recip                      = vbslq_s8(calc_reciprocal, a, recip);
+
+    // Calculate reciprocal
+    recip = vrecipq_qs8(recip, fixed_point_position);
+    a     = vbslq_s8(calc_reciprocal, recip, a);
+
+    // Get the integer part of a
+    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
+    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position
+
+    // Get the exponent n of the greatest power of two less than or equal to dec_a
+    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
+
+    // Get x to range (1, 2]
+    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
+    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
+    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);
+
+    // Polynomial Approximation
+    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
+
+    // Reconstruct
+    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
+
+    // Set negative value for 0 < a < 1
+    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
+
+    return poly;
+}
+
+inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right by 1
+    qint8x8_t temp         = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
+    uint8x8_t temp_ltz     = vclt_s8(temp, vdup_n_qs8(0));
+    temp                   = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
+    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
+
+    temp = vshl_s8(a, shift_value);
+
+    // Initial guess
+    qint8x8_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After three iterations we have the result for 8 bit
+    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vshl_s8(x, shift_value2);
+}
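x <- x * (3 - t * x^2) / 2 is the Newton-Raphson update for f(x) = x^-2 - t, whose root is 1/sqrt(t); once the input has been normalised near one, three iterations push the error well below the 8 bit resolution of 2^-7. A float sketch, exposition only:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double t = 0.8; // normalised input, as the shift above arranges
        double x = t;         // same initial guess as the code
        for (int i = 0; i < 3; ++i)
        {
            x = x * (3.0 - t * x * x) / 2.0; // Newton-Raphson step for 1/sqrt(t)
        }
        printf("%.6f vs %.6f\n", x, 1.0 / std::sqrt(t)); // 1.117525 vs 1.118034
        return 0;
    }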
+
+inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right by 1
+    qint8x8_t temp         = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
+    uint8x8_t temp_ltz     = vclt_s8(temp, vdup_n_qs8(0));
+    temp                   = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
+    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
+
+    temp = vshl_s8(a, shift_value);
+
+    // Initial guess
+    qint8x8_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After three iterations we have the result for 8 bit
+    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vshl_s8(x, shift_value2);
+}
+
+inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right by 1
+    qint8x16_t temp         = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
+    uint8x16_t temp_ltz     = vcltq_s8(temp, vdupq_n_qs8(0));
+    temp                    = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
+    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
+
+    temp = vshlq_s8(a, shift_value);
+
+    // Initial guess
+    qint8x16_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After three iterations we have the result for 8 bit
+    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vshlq_s8(x, shift_value2);
+}
+
+inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right by 1
+    qint8x16_t temp         = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
+    uint8x16_t temp_ltz     = vcltq_s8(temp, vdupq_n_qs8(0));
+    temp                    = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
+    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
+
+    temp = vshlq_s8(a, shift_value);
+
+    // Initial guess
+    qint8x16_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After three iterations we have the result for 8 bit
+    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vshlq_s8(x, shift_value2);
+}
+
+inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
+    const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
+
+    qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
+    qint8x8_t num   = vqsub_qs8(exp2x, const_one);
+    qint8x8_t den   = vqadd_qs8(exp2x, const_one);
+    qint8x8_t tanh  = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position);
+
+    return tanh;
+}
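Both tanh implementations lean on the identity tanh(x) = (e^(2x) - 1) / (e^(2x) + 1), which costs one exponential and one reciprocal instead of two exponentials. A float check, exposition only:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double x     = 0.4;
        const double exp2x = std::exp(2.0 * x);
        printf("%f vs %f\n", (exp2x - 1.0) / (exp2x + 1.0), std::tanh(x)); // both 0.379949
        return 0;
    }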
+
+inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
+    const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
+
+    qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
+    qint8x16_t num   = vqsubq_qs8(exp2x, const_one);
+    qint8x16_t den   = vqaddq_qs8(exp2x, const_one);
+    qint8x16_t tanh  = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
+
+    return tanh;
+}
+
+inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+    return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
+}
+}
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
new file mode 100644
index 0000000..eaa50f1
--- /dev/null
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEKERNELS_H__
+#define __ARM_COMPUTE_NEKERNELS_H__
+
+/* Header grouping all the NEON kernels */
+#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
+#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
+#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
+#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
+#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
+#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
+#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
+#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
+#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+
+#endif /* __ARM_COMPUTE_NEKERNELS_H__ */
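
For client code, the point of this umbrella header is that a single include brings in every kernel declaration listed above. A minimal sketch (the kernel class names are taken from the include list; the rest is illustrative):

    // Hypothetical translation unit: one include provides all NEON kernel declarations.
    #include "arm_compute/core/NEON/NEKernels.h"

    int main()
    {
        arm_compute::NEBox3x3Kernel    box3x3;    // declared via the umbrella header
        arm_compute::NETransposeKernel transpose; // likewise
        (void)box3x3;
        (void)transpose;
        return 0;
    }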
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
new file mode 100644
index 0000000..bb8a330
--- /dev/null
+++ b/arm_compute/core/NEON/NEMath.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMATH_H__
+#define __ARM_COMPUTE_NEMATH_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+float32x4_t vinvsqrtq_f32(float32x4_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+float32x4_t vinvq_f32(float32x4_t x);
+
+/** Perform a 7th degree polynomial approximation using Estrin's method.
+ *
+ * @param[in] x      Input vector value in F32 format.
+ * @param[in] coeffs Polynomial coefficients table.
+ *
+ * @return The calculated approximation.
+ */
+float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs);
+
+/** Calculate exponential.
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated exponential.
+ */
+float32x4_t vexpq_f32(float32x4_t x);
+
+/** Calculate natural logarithm.
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated logarithm.
+ */
+float32x4_t vlogq_f32(float32x4_t x);
+
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-10,10] to avoid overflow issues.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+float32x4_t vtanhq_f32(float32x4_t val);
+
+/** Calculate the n-th power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] val Input vector value in F32 format.
+ * @param[in] n   Power to raise the input to.
+ *
+ * @return The calculated power.
+ */
+float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
+}
+#include "arm_compute/core/NEON/NEMath.inl"
+#endif /* __ARM_COMPUTE_NEMATH_H__ */
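
A minimal usage sketch for these helpers (assumes a NEON-capable build; all of these are approximations, so the expected per-lane values in the comments are approximate):

    #include "arm_compute/core/NEON/NEMath.h"

    #include <arm_neon.h>
    #include <cstdio>

    int main()
    {
        using namespace arm_compute;
        const float32x4_t x = vdupq_n_f32(2.f);

        const float32x4_t e = vexpq_f32(x);                   // ~7.389 per lane (e^2)
        const float32x4_t l = vlogq_f32(x);                   // ~0.693 per lane (ln 2)
        const float32x4_t r = vinvsqrtq_f32(x);               // ~0.707 per lane (1/sqrt(2))
        const float32x4_t p = vpowq_f32(x, vdupq_n_f32(3.f)); // ~8.0 per lane (2^3)

        printf("%f %f %f %f\n", vgetq_lane_f32(e, 0), vgetq_lane_f32(l, 0),
               vgetq_lane_f32(r, 0), vgetq_lane_f32(p, 0));
        return 0;
    }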
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
new file mode 100644
index 0000000..a31a4c0
--- /dev/null
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+namespace arm_compute
+{
+/* Exponent polynomial coefficients */
+const std::array<float32x4_t, 8> exp_tab =
+{
+    {
+        vdupq_n_f32(1.f),
+        vdupq_n_f32(0.0416598916054f),
+        vdupq_n_f32(0.500000596046f),
+        vdupq_n_f32(0.0014122662833f),
+        vdupq_n_f32(1.00000011921f),
+        vdupq_n_f32(0.00833693705499f),
+        vdupq_n_f32(0.166665703058f),
+        vdupq_n_f32(0.000195780929062f),
+    }
+};
+
+/* Logarithm polynomial coefficients */
+const std::array<float32x4_t, 8> log_tab =
+{
+    {
+        vdupq_n_f32(-2.29561495781f),
+        vdupq_n_f32(-2.47071170807f),
+        vdupq_n_f32(-5.68692588806f),
+        vdupq_n_f32(-0.165253549814f),
+        vdupq_n_f32(5.17591238022f),
+        vdupq_n_f32(0.844007015228f),
+        vdupq_n_f32(4.58445882797f),
+        vdupq_n_f32(0.0141278216615f),
+    }
+};
+
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
+{
+    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+    return sqrt_reciprocal;
+}
+
+inline float32x4_t vinvq_f32(float32x4_t x)
+{
+    float32x4_t recip = vrecpeq_f32(x);
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    return recip;
+}
+
+inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs)
+{
+    float32x4_t A   = vmlaq_f32(coeffs[0], coeffs[4], x);
+    float32x4_t B   = vmlaq_f32(coeffs[2], coeffs[6], x);
+    float32x4_t C   = vmlaq_f32(coeffs[1], coeffs[5], x);
+    float32x4_t D   = vmlaq_f32(coeffs[3], coeffs[7], x);
+    float32x4_t x2  = vmulq_f32(x, x);
+    float32x4_t x4  = vmulq_f32(x2, x2);
+    float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
+    return res;
+}
+
+inline float32x4_t vexpq_f32(float32x4_t x)
+{
+    static const float32x4_t CONST_LN2     = vdupq_n_f32(0.6931471805f); // ln(2)
+    static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
+
+    // Perform range reduction [-log(2),log(2)]
+    int32x4_t   m   = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
+    float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
+
+    // Polynomial Approximation
+    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
+
+    // Reconstruct
+    poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
+
+    return poly;
+}
+
+inline float32x4_t vlogq_f32(float32x4_t x)
+{
+    static const int32x4_t   CONST_127 = vdupq_n_s32(127);           // 127
+    static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
+
+    // Extract exponent
+    int32x4_t   m   = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
+    float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+
+    // Polynomial Approximation
+    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
+
+    // Reconstruct
+    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
+
+    return poly;
+}
+
+inline float32x4_t vtanhq_f32(float32x4_t val)
+{
+    static const float32x4_t CONST_1        = vdupq_n_f32(1.f);
+    static const float32x4_t CONST_2        = vdupq_n_f32(2.f);
+    static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-10.f);
+    static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(10.f);
+
+    float32x4_t x     = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
+    float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
+    float32x4_t num   = vsubq_f32(exp2x, CONST_1);
+    float32x4_t den   = vaddq_f32(exp2x, CONST_1);
+    float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den));
+    return tanh;
+}
+
+inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
+{
+    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
+}
+}
\ No newline at end of file
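
The bit-twiddling reconstruction in vexpq_f32 above is easier to follow in scalar form: write e^x = 2^m * e^r with r = x - m*ln(2), approximate e^r with the polynomial, then multiply by 2^m by adding m to the IEEE-754 exponent field. An illustrative scalar sketch (the cubic below is a stand-in for the Estrin-evaluated polynomial; this is not the library's code):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Scalar model of the vexpq_f32 strategy: e^x = 2^m * e^r, with r kept small.
    static float exp_sketch(float x)
    {
        const float ln2     = 0.6931471805f;
        const float inv_ln2 = 1.4426950408f;

        const int   m = static_cast<int>(x * inv_ln2);   // truncation, like vcvtq_s32_f32
        const float r = x - static_cast<float>(m) * ln2; // like vmlsq_f32

        // Low-degree stand-in for the polynomial over the reduced range.
        float poly = 1.f + r + (r * r) / 2.f + (r * r * r) / 6.f;

        // Scale by 2^m by adding m to the exponent bits, like vshlq_n_s32(m, 23) + vaddq_s32.
        uint32_t bits;
        std::memcpy(&bits, &poly, sizeof(bits));
        bits += static_cast<uint32_t>(m) << 23;
        std::memcpy(&poly, &bits, sizeof(poly));
        return poly;
    }

    int main()
    {
        printf("sketch: %f  std::exp: %f\n", exp_sketch(1.5f), std::exp(1.5f));
        return 0;
    }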
diff --git a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
new file mode 100644
index 0000000..9ef93ce
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__
+#define __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the absolute difference kernel
+ *
+ * Absolute difference is computed by:
+ * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
+ */
+class NEAbsoluteDifferenceKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEAbsoluteDifferenceKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEAbsoluteDifferenceKernel(const NEAbsoluteDifferenceKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEAbsoluteDifferenceKernel &operator=(const NEAbsoluteDifferenceKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEAbsoluteDifferenceKernel(NEAbsoluteDifferenceKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEAbsoluteDifferenceKernel &operator=(NEAbsoluteDifferenceKernel &&) = default;
+    /** Default destructor */
+    ~NEAbsoluteDifferenceKernel() = default;
+
+    /** Set the inputs and output tensors
+     *
+     * @param[in]  input1 Source tensor. Data types supported: U8/S16
+     * @param[in]  input2 Source tensor. Data types supported: U8/S16
+     * @param[out] output Destination tensor. Data types supported: U8 (only if both inputs are U8)/S16.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised absolute difference functions
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16.
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16.
+     * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16.
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using AbsDiffFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+
+    /** Absolute difference function to use for the particular tensor formats passed to configure() */
+    AbsDiffFunction *_func;
+    const ITensor   *_input1;
+    const ITensor   *_input2;
+    ITensor         *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__ */
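
A hedged usage sketch for configuring and running a kernel like this one (Tensor/TensorInfo come from the library's runtime and core headers; running over the kernel's own window is a single-threaded shortcut, whereas real applications would normally go through the corresponding NE function or a scheduler):

    #include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Two U8 inputs and a U8 output of the same 640x480 shape.
        Tensor in1, in2, out;
        const TensorInfo info(640, 480, Format::U8);
        in1.allocator()->init(info);
        in2.allocator()->init(info);
        out.allocator()->init(info);
        in1.allocator()->allocate();
        in2.allocator()->allocate();
        out.allocator()->allocate();

        NEAbsoluteDifferenceKernel absdiff;
        absdiff.configure(&in1, &in2, &out);

        // Execute over the full window computed by configure().
        absdiff.run(absdiff.window());
        return 0;
    }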
diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
new file mode 100644
index 0000000..df6d7b8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEACCUMULATEKERNEL_H__
+#define __ARM_COMPUTE_NEACCUMULATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the accumulate kernel
+ *
+ * Accumulation is computed by:
+ * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
+ */
+class NEAccumulateKernel : public INESimpleKernel
+{
+public:
+    /** Set the input and accumulation tensors
+     *
+     * @param[in]  input Source tensor. Data type supported: U8.
+     * @param[in,out] accum Accumulated tensor. Data type supported: S16.
+     */
+    void configure(const ITensor *input, ITensor *accum);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+};
+
+/** Interface for the accumulate weighted kernel
+ *
+ * Weighted accumulation is computed by:
+ * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
+ *
+ * Where @f$ 0 \le \alpha \le 1 @f$.
+ *
+ * Conceptually, the rounding for this is defined as:
+ * @f[ output(x,y) = uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
+ */
+class NEAccumulateWeightedKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEAccumulateWeightedKernel();
+    /** Set the input and accumulation tensors, and the scale value
+     *
+     * @param[in]     input Source tensor. Data type supported: U8.
+     * @param[in]     alpha Scalar value in the range [0.0f, 1.0f]
+     * @param[in,out] accum Accumulated tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input, float alpha, ITensor *accum);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+protected:
+    float _alpha;
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** Interface for the accumulate weighted kernel using F16 */
+class NEAccumulateWeightedFP16Kernel : public NEAccumulateWeightedKernel
+{
+public:
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+};
+#else
+using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel;
+#endif
+
+/** Interface for the accumulate squared kernel
+ *
+ * The accumulation of squares is computed:
+ * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
+ *
+ * Where @f$ 0 \le shift \le 15 @f$.
+ */
+class NEAccumulateSquaredKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEAccumulateSquaredKernel();
+    /** Set the input and accumulation tensors and the shift value.
+     *
+     * @param[in]     input Source tensor. Data type supported: U8.
+     * @param[in]     shift Shift value in the range of [0, 15]
+     * @param[in,out] accum Accumulated tensor. Data type supported: S16.
+     */
+    void configure(const ITensor *input, uint32_t shift, ITensor *accum);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    uint32_t _shift;
+};
+}
+#endif /*__ARM_COMPUTE_NEACCUMULATEKERNEL_H__ */
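
To make the weighted-accumulation rounding concrete, here is an illustrative scalar model of the per-pixel update documented above (not the kernel's code):

    #include <cstdint>
    #include <cstdio>

    // Scalar model of NEAccumulateWeightedKernel's per-pixel update:
    // accum = uint8((1 - alpha) * accum + alpha * input), with alpha in [0, 1].
    static uint8_t accumulate_weighted(uint8_t accum, uint8_t input, float alpha)
    {
        const float blended = (1.f - alpha) * static_cast<float>(accum)
                              + alpha * static_cast<float>(input);
        return static_cast<uint8_t>(blended); // convert back to uint8, as in the formula
    }

    int main()
    {
        uint8_t accum = 200;
        accum = accumulate_weighted(accum, 100, 0.25f); // 0.75*200 + 0.25*100 = 175
        printf("%u\n", accum);
        return 0;
    }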
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
new file mode 100644
index 0000000..97f92d6
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the activation layer kernel. */
+class NEActivationLayerKernel : public INESimpleKernel
+{
+public:
+    /** Constructor */
+    NEActivationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEActivationLayerKernel(const NEActivationLayerKernel &) = delete;
+    /** Default move constructor */
+    NEActivationLayerKernel(NEActivationLayerKernel &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEActivationLayerKernel &operator=(const NEActivationLayerKernel &) = delete;
+    /** Default move assignment operator */
+    NEActivationLayerKernel &operator=(NEActivationLayerKernel &&) = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input           Source tensor. Data types supported: QS8/F32.
+     * @param[out] output          Destination tensor. Data type supported: same as @p input
+     * @param[in]  activation_info Activation layer information.
+     */
+    void configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using ActivationFunction = ActivationLayerInfo::ActivationFunction;
+    /** Common signature for all the specialised @ref NEActivationLayerKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using ActivationFunctionExecutorPtr = void (NEActivationLayerKernel::*)(const Window &window);
+    /** Function to apply an activation function on a tensor.
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    template <ActivationLayerInfo::ActivationFunction F, typename T>
+    typename std::enable_if<std::is_same<T, float>::value, void>::type activation(const Window &window);
+    /** Function to apply an activation function on a tensor.
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    template <ActivationLayerInfo::ActivationFunction F, typename T>
+    typename std::enable_if<std::is_same<T, qint8_t>::value, void>::type activation(const Window &window);
+
+private:
+    ActivationFunctionExecutorPtr _func;
+    ActivationLayerInfo           _act_info;
+};
+}
+#endif /*__ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ */
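
A hedged configuration sketch (ActivationLayerInfo and its ActivationFunction enumeration live in Types.h; the RELU choice and the tensor shape are illustrative):

    #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        src.allocator()->init(info);
        dst.allocator()->init(info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        NEActivationLayerKernel act;
        act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
        act.run(act.window()); // single-threaded shortcut over the configured window
        return 0;
    }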
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
new file mode 100644
index 0000000..b36ca46
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__
+#define __ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform addition between two tensors */
+class NEArithmeticAdditionKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEArithmeticAdditionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticAdditionKernel(const NEArithmeticAdditionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticAdditionKernel &operator=(const NEArithmeticAdditionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEArithmeticAdditionKernel(NEArithmeticAdditionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEArithmeticAdditionKernel &operator=(NEArithmeticAdditionKernel &&) = default;
+    /** Default destructor */
+    ~NEArithmeticAdditionKernel() = default;
+
+    /** Initialise the kernel's inputs, output and overflow policy.
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32.
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16, or F32 (only if both inputs are F32).
+     * @param[in]  policy Overflow policy.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised add functions
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32.
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16, or F32 (only if both inputs are F32).
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+    /** Add function to use for the particular tensor types passed to configure() */
+    AddFunction   *_func;
+    const ITensor *_input1;
+    const ITensor *_input2;
+    ITensor       *_output;
+};
+}
+#endif /*__ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__ */
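
The ConvertPolicy argument decides what happens on overflow. An illustrative scalar model for the U8 + U8 -> U8 case (not the kernel's code):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // WRAP keeps the low 8 bits of the sum; SATURATE clamps to the U8 range.
    static uint8_t add_u8_wrap(uint8_t a, uint8_t b)
    {
        return static_cast<uint8_t>(a + b); // 250 + 10 -> 4
    }

    static uint8_t add_u8_saturate(uint8_t a, uint8_t b)
    {
        return static_cast<uint8_t>(std::min<int>(int{a} + int{b}, 255)); // 250 + 10 -> 255
    }

    int main()
    {
        printf("wrap: %u  saturate: %u\n", add_u8_wrap(250, 10), add_u8_saturate(250, 10));
        return 0;
    }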
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
new file mode 100644
index 0000000..0eb9c23
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__
+#define __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform subtraction between two tensors */
+class NEArithmeticSubtractionKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEArithmeticSubtractionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticSubtractionKernel(const NEArithmeticSubtractionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticSubtractionKernel &operator=(const NEArithmeticSubtractionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEArithmeticSubtractionKernel(NEArithmeticSubtractionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEArithmeticSubtractionKernel &operator=(NEArithmeticSubtractionKernel &&) = default;
+    /** Default destructor */
+    ~NEArithmeticSubtractionKernel() = default;
+
+    /** Initialise the kernel's inputs, output and overflow policy.
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32.
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16, or F32 (only if both inputs are F32).
+     * @param[in]  policy Overflow policy.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised sub functions
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32.
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16, or F32 (only if both inputs are F32).
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+    /** Sub function to use for the particular tensor types passed to configure() */
+    SubFunction   *_func;
+    const ITensor *_input1;
+    const ITensor *_input2;
+    ITensor       *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
new file mode 100644
index 0000000..29fcbd2
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the batch normalization layer kernel.
+ */
+class NEBatchNormalizationLayerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEBatchNormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchNormalizationLayerKernel(const NEBatchNormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchNormalizationLayerKernel &operator=(const NEBatchNormalizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    NEBatchNormalizationLayerKernel(NEBatchNormalizationLayerKernel &&) = default;
+    /** Default move assignment operator. */
+    NEBatchNormalizationLayerKernel &operator=(NEBatchNormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEBatchNormalizationLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                     The rest are optional and used for representing batches. Data types supported: QS8/F32.
+     * @param[out] output  Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in]  mean    Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  var     Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  beta    Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  gamma   Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  epsilon Small value to avoid division by zero.
+     */
+    void configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using BatchNormFunction = void(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window);
+    BatchNormFunction *_func;
+    const ITensor     *_input;
+    ITensor           *_output;
+    const ITensor     *_mean;
+    const ITensor     *_var;
+    const ITensor     *_gamma;
+    const ITensor     *_beta;
+    float              _epsilon;
+};
+}
+#endif /*__ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__ */
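
The documentation lists the parameter tensors but not the transform itself; per feature map this is presumably the standard inference-time batch normalization, sketched here in scalar form (illustrative, not the kernel's code):

    #include <cmath>
    #include <cstdio>

    // out = gamma * (x - mean) / sqrt(var + epsilon) + beta
    static float batch_norm(float x, float mean, float var, float beta, float gamma, float epsilon)
    {
        return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
    }

    int main()
    {
        // One activation from a feature map with mean 0.5 and variance 4:
        printf("%f\n", batch_norm(2.5f, 0.5f, 4.f, 0.1f, 2.f, 1e-5f)); // ~2.1
        return 0;
    }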
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
new file mode 100644
index 0000000..b931445
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEANDKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISEANDKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise AND between XY-planes of two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f]
+ */
+class NEBitwiseAndKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEBitwiseAndKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseAndKernel(const NEBitwiseAndKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseAndKernel &operator=(const NEBitwiseAndKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEBitwiseAndKernel(NEBitwiseAndKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEBitwiseAndKernel &operator=(NEBitwiseAndKernel &&) = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input1 An input tensor. Data type supported: U8.
+     * @param[in]  input2 An input tensor. Data type supported: U8.
+     * @param[out] output Output tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input1; /**< Source tensor 1 */
+    const ITensor *_input2; /**< Source tensor 2 */
+    ITensor       *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEANDKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
new file mode 100644
index 0000000..e34eb0f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISENOTKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISENOTKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise NOT operation
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = \lnot input(x,y) @f]
+ */
+class NEBitwiseNotKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEBitwiseNotKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseNotKernel(const NEBitwiseNotKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseNotKernel &operator=(const NEBitwiseNotKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEBitwiseNotKernel(NEBitwiseNotKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEBitwiseNotKernel &operator=(NEBitwiseNotKernel &&) = default;
+    /** Initialise the kernel's input and output
+     *
+     * @param[in]  input  An input tensor. Data type supported: U8.
+     * @param[out] output The output tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input;  /**< Source tensor */
+    ITensor       *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISENOTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
new file mode 100644
index 0000000..d2bae26
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEORKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISEORKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise inclusive OR between two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f]
+ */
+class NEBitwiseOrKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEBitwiseOrKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseOrKernel(const NEBitwiseOrKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseOrKernel &operator=(const NEBitwiseOrKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEBitwiseOrKernel(NEBitwiseOrKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEBitwiseOrKernel &operator=(NEBitwiseOrKernel &&) = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input1 An input tensor. Data type supported: U8.
+     * @param[in]  input2 An input tensor. Data type supported: U8.
+     * @param[out] output Output tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input1; /**< Source tensor 1 */
+    const ITensor *_input2; /**< Source tensor 2 */
+    ITensor       *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEORKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
new file mode 100644
index 0000000..9dea36e
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEXORKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISEXORKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise exclusive OR (XOR) between two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f]
+ */
+class NEBitwiseXorKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEBitwiseXorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseXorKernel(const NEBitwiseXorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseXorKernel &operator=(const NEBitwiseXorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEBitwiseXorKernel(NEBitwiseXorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEBitwiseXorKernel &operator=(NEBitwiseXorKernel &&) = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input1 An input tensor. Data type supported: U8.
+     * @param[in]  input2 An input tensor. Data type supported: U8.
+     * @param[out] output The output tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input1; /**< Source tensor 1 */
+    const ITensor *_input2; /**< Source tensor 2 */
+    ITensor       *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEXORKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
new file mode 100644
index 0000000..6b7bebb
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBOX3x3KERNEL_H__
+#define __ARM_COMPUTE_NEBOX3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Box 3x3 filter */
+class NEBox3x3Kernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data type supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** NEON kernel to perform a Box 3x3 filter using F16 SIMD
+ */
+class NEBox3x3FP16Kernel : public NEBox3x3Kernel
+{
+public:
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+};
+#else
+using NEBox3x3FP16Kernel = NEBox3x3Kernel;
+#endif
+}
+#endif /*__ARM_COMPUTE_NEBOX3x3KERNEL_H__ */
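
For reference, a Box 3x3 filter replaces each pixel with the mean of its 3x3 neighbourhood. An illustrative scalar model (border pixels are simply skipped here, in the spirit of an undefined border mode):

    #include <cstdint>
    #include <cstdio>

    static void box3x3(const uint8_t *src, uint8_t *dst, int width, int height)
    {
        for(int y = 1; y < height - 1; ++y)
        {
            for(int x = 1; x < width - 1; ++x)
            {
                int sum = 0;
                for(int dy = -1; dy <= 1; ++dy)
                {
                    for(int dx = -1; dx <= 1; ++dx)
                    {
                        sum += src[(y + dy) * width + (x + dx)];
                    }
                }
                dst[y * width + x] = static_cast<uint8_t>(sum / 9);
            }
        }
    }

    int main()
    {
        const uint8_t src[9] = { 9, 9, 9, 9, 18, 9, 9, 9, 9 };
        uint8_t       dst[9] = { 0 };
        box3x3(src, dst, 3, 3);
        printf("%u\n", dst[4]); // (8*9 + 18) / 9 = 10
        return 0;
    }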
diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
new file mode 100644
index 0000000..b86085f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECANNYEDGEKERNEL_H__
+#define __ARM_COMPUTE_NECANNYEDGEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Computes magnitude and quantised phase from input gradients. */
+class NEGradientKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEGradientKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGradientKernel(const NEGradientKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGradientKernel &operator=(const NEGradientKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGradientKernel(NEGradientKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGradientKernel &operator=(NEGradientKernel &&) = default;
+    /** Default destructor */
+    virtual ~NEGradientKernel() = default;
+
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @note gx, gy and magnitude must all have the same element size (either 16-bit or 32-bit)
+     *
+     * @param[in]  gx        Source tensor - Gx component. Data type supported: S16/S32.
+     * @param[in]  gy        Source tensor - Gy component. Data type supported: same as @p gx.
+     * @param[out] magnitude Destination tensor - Magnitude. Data type supported: U16 (if the data type of @p gx is S16) / U32 (if the data type of @p gx is S32).
+     * @param[out] phase     Destination tensor - Quantized phase. Data type supported: U8.
+     * @param[in]  norm_type Normalization type. If 1, L1-Norm, otherwise L2-Norm.
+     */
+    virtual void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+protected:
+    /** Common signature for all the specialised gradient functions
+     *
+     * @param[in]  gx_ptr        Pointer to the first input tensor.
+     * @param[in]  gy_ptr        Pointer to the second input tensor.
+     * @param[out] magnitude_ptr Pointer to the first output tensor
+     * @param[out] phase_ptr     Pointer to the second output tensor
+     */
+    using GradientFunction = void(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr);
+
+    GradientFunction *_func;      /**< Gradient function to use for the particular tensor types passed to configure() */
+    const ITensor    *_gx;        /**< Source tensor - Gx component */
+    const ITensor    *_gy;        /**< Source tensor - Gy component */
+    ITensor          *_magnitude; /**< Destination tensor - Magnitude */
+    ITensor          *_phase;     /**< Destination tensor - Quantized phase */
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** NEON kernel to perform Gradient computation
+ */
+class NEGradientFP16Kernel : public NEGradientKernel
+{
+public:
+    // Inherited methods overridden:
+    void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type) override;
+};
+#else  /* ARM_COMPUTE_ENABLE_FP16 */
+using NEGradientFP16Kernel = NEGradientKernel;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
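
To make the norm_type parameter concrete, an illustrative scalar model of the magnitude computation (the kernel's internal phase quantisation is not modelled here):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    // norm_type == 1 -> L1 norm |gx| + |gy|; otherwise -> L2 norm sqrt(gx^2 + gy^2).
    static uint32_t gradient_magnitude(int16_t gx, int16_t gy, int32_t norm_type)
    {
        if(norm_type == 1)
        {
            return static_cast<uint32_t>(std::abs(static_cast<int>(gx)) + std::abs(static_cast<int>(gy)));
        }
        const float fx = static_cast<float>(gx);
        const float fy = static_cast<float>(gy);
        return static_cast<uint32_t>(std::sqrt(fx * fx + fy * fy));
    }

    int main()
    {
        printf("L1: %u  L2: %u\n", gradient_magnitude(3, 4, 1), gradient_magnitude(3, 4, 2));
        return 0;
    }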
+
+/** NEON kernel to perform Non-Maxima suppression for Canny Edge.
+ *
+ * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using the magnitude and phase of the input
+ *       to characterize points as possible edges. Thus, at the end, each point will be set to EDGE, NO_EDGE or MAYBE.
+ *
+ * @note Hysteresis is computed in @ref NEEdgeTraceKernel
+ */
+class NEEdgeNonMaxSuppressionKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEEdgeNonMaxSuppressionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeNonMaxSuppressionKernel(const NEEdgeNonMaxSuppressionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeNonMaxSuppressionKernel &operator=(const NEEdgeNonMaxSuppressionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEEdgeNonMaxSuppressionKernel(NEEdgeNonMaxSuppressionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEEdgeNonMaxSuppressionKernel &operator=(NEEdgeNonMaxSuppressionKernel &&) = default;
+    /** Default destructor */
+    ~NEEdgeNonMaxSuppressionKernel() = default;
+
+    /** Initialise the kernel's sources, destination and border mode.
+     *
+     * @param[in]  magnitude        Source tensor - Magnitude. Data type supported: U16/U32.
+     * @param[in]  phase            Source tensor - Quantized phase. Data type supported: U8.
+     * @param[out] output           Output tensor. Data type supported: U8. It will be filled with 0 for "no edge", 127 for "maybe", 255 for "edge"
+     * @param[in]  upper_thr        Upper threshold used for the hysteresis
+     * @param[in]  lower_thr        Lower threshold used for the hysteresis
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *magnitude, const ITensor *phase, ITensor *output, int32_t upper_thr, int32_t lower_thr, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Common signature for all the specialised non-maxima suppression functions
+     *
+     * @param[in]  magnitude_ptr Pointer to the first input tensor.
+     * @param[in]  phase_ptr     Pointer to the second input tensor.
+     * @param[out] output_ptr    Pointer to the output tensor
+     * @param[in]  stride_mag    Stride of the magnitude tensor
+     * @param[in]  upper_thr     Upper threshold used for the hysteresis
+     * @param[in]  lower_thr     Lower threshold used for the hysteresis
+     */
+    using EdgeNonMaxSupprFunction = void(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t upper_thr,
+                                         const int32_t lower_thr);
+
+    EdgeNonMaxSupprFunction *_func;      /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
+    const ITensor           *_magnitude; /**< Source tensor - Magnitude */
+    const ITensor           *_phase;     /**< Source tensor - Quantized phase */
+    ITensor                 *_output;    /**< Destination tensor */
+    int32_t                  _lower_thr; /**< Lower threshold used for the hysteresis */
+    int32_t                  _upper_thr; /**< Upper threshold used for the hysteresis */
+};
+
+/** NEON kernel to perform Edge tracing */
+class NEEdgeTraceKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEEdgeTraceKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeTraceKernel(const NEEdgeTraceKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeTraceKernel &operator=(const NEEdgeTraceKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEEdgeTraceKernel(NEEdgeTraceKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEEdgeTraceKernel &operator=(NEEdgeTraceKernel &&) = default;
+    /** Default destructor */
+    ~NEEdgeTraceKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in,out] input  Source tensor. Data type supported: U8. Must contain 0 for "no edge", 127 for "maybe", 255 for "edge"
+     * @param[in,out] output Destination tensor. Data type supported: U8. Must be initialized to 0 (No edge).
+     */
+    void configure(ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+    bool       is_parallelisable() const override;
+
+private:
+    ITensor *_input;  /**< Source tensor */
+    ITensor *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NECANNYEDGEKERNEL_H__ */
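
For reference, here is a minimal scalar sketch of the magnitude/phase contract documented for NEGradientKernel above. The L1/L2 selection follows the norm_type parameter; the exact quantization of the phase onto U8 is an assumption for illustration, and this models the semantics only, not the kernel's NEON implementation.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Illustrative model of one gradient element: S16/S32 gx/gy in,
    // U16/U32 magnitude and U8 quantized phase out.
    void gradient_element(int32_t gx, int32_t gy, int32_t norm_type,
                          uint32_t &magnitude, uint8_t &phase)
    {
        const double pi = 3.14159265358979323846;
        magnitude = (norm_type == 1)
                    ? static_cast<uint32_t>(std::abs(gx) + std::abs(gy)) // L1-Norm
                    : static_cast<uint32_t>(std::sqrt(static_cast<double>(gx) * gx + static_cast<double>(gy) * gy)); // L2-Norm
        // Assumed quantization: map the angle range [0, 2*pi) onto the 256 levels of the U8 phase tensor.
        const double angle = std::atan2(static_cast<double>(gy), static_cast<double>(gx)) + pi;
        phase              = static_cast<uint8_t>(std::min(255.0, angle / (2.0 * pi) * 256.0));
    }
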
diff --git a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
new file mode 100644
index 0000000..8b669a4
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__
+#define __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <array>
+#include <cstdint>
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the channel combine kernel */
+class NEChannelCombineKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEChannelCombineKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelCombineKernel(const NEChannelCombineKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelCombineKernel &operator=(const NEChannelCombineKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEChannelCombineKernel(NEChannelCombineKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEChannelCombineKernel &operator=(NEChannelCombineKernel &&) = default;
+    /** Default destructor */
+    ~NEChannelCombineKernel() = default;
+
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8
+     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8
+     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8
+     * @param[in]  plane3 The 2D plane that forms channel 3. Data type supported: U8
+     * @param[out] output The single planar output tensor. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+     */
+    void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output);
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8
+     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8
+     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8
+     * @param[out] output The multi planar output tensor. Formats supported: NV12/NV21/IYUV/YUV444
+     */
+    void configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    bool is_parallelisable() const override;
+
+private:
+    /** Combine 3 planes to form a three channel single plane tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_3C(const Window &win);
+    /** Combine 4 planes to form a four channel single plane tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_4C(const Window &win);
+    /** Combine 3 planes to form a single plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    template <bool is_yuyv>
+    void combine_YUV_1p(const Window &win);
+    /** Combine 3 planes to form a two plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_YUV_2p(const Window &win);
+    /** Combine 3 planes to form a three plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_YUV_3p(const Window &win);
+    /** Copies a full plane to the output tensor.
+     *
+     * @param[in] win      Region on which to execute the kernel.
+     * @param[in] plane_id Index of the plane to copy.
+     */
+    void copy_plane(const Window &win, uint32_t plane_id);
+    /** Common signature for all the specialised ChannelCombine functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using ChannelCombineFunction = void (NEChannelCombineKernel::*)(const Window &window);
+    /** ChannelCombine function to use for the particular tensor types passed to configure() */
+    ChannelCombineFunction _func;
+    std::array<const ITensor *, 4> _planes;
+    ITensor     *_output;
+    IMultiImage *_output_multi;
+    std::array<uint32_t, 3> _x_subsampling;
+    std::array<uint32_t, 3> _y_subsampling;
+    unsigned int _num_elems_processed_per_iteration;
+    bool         _is_parallelizable;
+};
+}
+#endif /* __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__ */
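
As a usage illustration of the multi-planar overload documented above (helper name hypothetical, tensor/image allocation and validation elided), three U8 planes could be combined into a two-plane NV12 image. A real pipeline would dispatch the kernel through a scheduler rather than running it single-threaded over its own window, as done here for brevity.

    #include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
    #include "arm_compute/runtime/MultiImage.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    // Combine Y, U and V planes (U/V at half resolution in x and y) into NV12.
    void combine_to_nv12(const Tensor &y, const Tensor &u, const Tensor &v, MultiImage &nv12)
    {
        NEChannelCombineKernel combine;
        combine.configure(&y, &u, &v, &nv12); // multi-planar overload
        combine.run(combine.window());        // single-threaded execution over the full window
    }
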
diff --git a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
new file mode 100644
index 0000000..0715e1f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__
+#define __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the channel extract kernel */
+class NEChannelExtractKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEChannelExtractKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelExtractKernel(const NEChannelExtractKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelExtractKernel &operator=(const NEChannelExtractKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEChannelExtractKernel(NEChannelExtractKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEChannelExtractKernel &operator=(NEChannelExtractKernel &&) = default;
+    /** Default destructor */
+    ~NEChannelExtractKernel() = default;
+
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input   Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
+     * @param[in]  channel Channel to extract.
+     * @param[out] output  Destination tensor. Format supported: U8
+     */
+    void configure(const ITensor *input, Channel channel, ITensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input   Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
+     * @param[in]  channel Channel to extract.
+     * @param[out] output  Single-planar destination image. Format supported: U8
+     */
+    void configure(const IMultiImage *input, Channel channel, IImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Extract one channel from a two channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_2C_img(const Window &win);
+    /** Extract one channel from a three channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_3C_img(const Window &win);
+    /** Extract one channel from a four channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_4C_img(const Window &win);
+    /** Extract U/V channel from a single planar YUYV/UYVY tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_YUYV_uv(const Window &win);
+    /** Copies a full plane to the output tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void copy_plane(const Window &win);
+    /** Common signature for all the specialised ChannelExtract functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using ChannelExtractFunction = void (NEChannelExtractKernel::*)(const Window &window);
+    /** ChannelExtract function to use for the particular tensor types passed to configure() */
+    ChannelExtractFunction _func;
+    unsigned int           _lut_index;
+};
+}
+#endif /* __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__ */
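
The mirror operation reads the same way; a short sketch (same caveats and hypothetical helper name as the combine example above) pulling the luma plane out of an NV12 image via the multi-planar overload:

    #include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
    #include "arm_compute/runtime/MultiImage.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    // Extract the Y channel of a multi-planar image into a single-planar U8 image.
    void extract_luma(const MultiImage &nv12, Tensor &luma)
    {
        NEChannelExtractKernel extract;
        extract.configure(&nv12, Channel::Y, &luma);
        extract.run(extract.window());
    }
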
diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
new file mode 100644
index 0000000..f6bc215
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECOL2IMKERNEL_H__
+#define __ARM_COMPUTE_NECOL2IMKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform col2im reshaping.
+ *
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref NEIm2ColKernel.
+ *
+ * For example, a vector of 9 elements can be reshaped to a block (image) of 3x3:
+ *
+ * @f[
+ * \left( \begin{array}{ccccccccc}
+ * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccc}
+ * a0 & a1 & a2 \\
+ * a3 & a4 & a5 \\
+ * a6 & a7 & a8 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NECol2ImKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NECol2ImKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECol2ImKernel(const NECol2ImKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECol2ImKernel &operator=(const NECol2ImKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NECol2ImKernel(NECol2ImKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NECol2ImKernel &operator=(NECol2ImKernel &&) = default;
+    /** Default destructor */
+    ~NECol2ImKernel() = default;
+
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input          The input tensor to convert. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+     *                            while the rest represent batch of outputs. Data types supported: Same as @p input
+     * @param[in]  convolved_dims Output convolved dimensions.
+     */
+    void configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Template function to run the col2im
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run_col2im(const Window &window);
+
+    /** Common signature for all the specialised col2im functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using Col2ImFunctionPtr = void (NECol2ImKernel::*)(const Window &window);
+
+    Col2ImFunctionPtr _func;
+    const ITensor    *_input;
+    ITensor          *_output;
+    std::pair<unsigned int, unsigned int> _convolved_dims;
+};
+}
+
+#endif /*__ARM_COMPUTE_NECOL2IMKERNEL_H__ */
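
A minimal scalar sketch of the mapping in the formula above, assuming a single column, unit strides and no batching: element i of a column of length w*h lands at image coordinates (i % w, i / w).

    #include <cstddef>

    // Reference col2im for one w*h column; the kernel additionally handles
    // tensor strides, data types and batches.
    void col2im_reference(const float *column, float *image, size_t w, size_t h)
    {
        for(size_t i = 0; i < w * h; ++i)
        {
            const size_t x = i % w; // column index inside the block
            const size_t y = i / w; // row index inside the block
            image[y * w + x] = column[i];
        }
    }
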
diff --git a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
new file mode 100644
index 0000000..2297218
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECOLORCONVERTKERNEL_H__
+#define __ARM_COMPUTE_NECOLORCONVERTKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the color convert kernel */
+class NEColorConvertKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEColorConvertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEColorConvertKernel(const NEColorConvertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEColorConvertKernel &operator=(const NEColorConvertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEColorConvertKernel(NEColorConvertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEColorConvertKernel &operator=(NEColorConvertKernel &&) = default;
+    /** Default destructor */
+    ~NEColorConvertKernel() = default;
+
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
+     * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
+     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888)
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
+     */
+    void configure(const IMultiImage *input, IImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+     * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
+     */
+    void configure(const IImage *input, IMultiImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of  @p input is IYUV)
+     */
+    void configure(const IMultiImage *input, IMultiImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
+    const void           *_input;
+    void                 *_output;
+    ColorConvertFunction *_func;
+};
+}
+#endif /*__ARM_COMPUTE_NECOLORCONVERTKERNEL_H__ */
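
To make the YUV-to-RGB conversions concrete, one common BT.601 full-range formulation is sketched below; the coefficients are illustrative and not necessarily the exact ones the kernel uses.

    #include <algorithm>
    #include <cstdint>

    // Per-pixel YUV -> RGB conversion (BT.601 full range, assumed).
    void yuv_to_rgb(uint8_t y, uint8_t u, uint8_t v, uint8_t &r, uint8_t &g, uint8_t &b)
    {
        const float fy = static_cast<float>(y);
        const float fu = static_cast<float>(u) - 128.f;
        const float fv = static_cast<float>(v) - 128.f;
        const auto clamp8 = [](float x) { return static_cast<uint8_t>(std::min(std::max(x, 0.f), 255.f)); };
        r = clamp8(fy + 1.402f * fv);
        g = clamp8(fy - 0.344f * fu - 0.714f * fv);
        b = clamp8(fy + 1.772f * fu);
    }
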
diff --git a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
new file mode 100644
index 0000000..588a228
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECONVOLUTIONKERNEL_H__
+#define __ARM_COMPUTE_NECONVOLUTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+#include <array>
+#include <cstdint>
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/****************************************************************************************\
+ *                                    Square Convolution                                *
+\****************************************************************************************/
+
+/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
+ * The client can supply a convolution matrix \f$ C_{m,n} \f$.
+ * @f{eqnarray}{
+ *  k_0 &=& \frac{m}{2}  \\
+ *  l_0 &=& \frac{n}{2}  \\
+ *  sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
+ *  @f}
+ *
+ * @note The above equation for this function is similar to the default OpenCV Filter2D function,
+ *       which actually computes a correlation and not a convolution.
+ *       In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
+ */
+template <unsigned int matrix_size>
+class NEConvolutionKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEConvolutionKernel();
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    template <typename OutputType>
+    void convolution(const Window &win);
+
+protected:
+    uint32_t _scale;                                             /**< scale of the convolution */
+    std::array<int16_t, matrix_size *matrix_size> _convolution;  /**< convolution matrix */
+};
+
+/** Interface for the kernel which applies a 3x3 convolution to a tensor.*/
+using NEConvolution3x3Kernel = NEConvolutionKernel<3>;
+/** Interface for the kernel which applies a 5x5 convolution to a tensor.*/
+using NEConvolution5x5Kernel = NEConvolutionKernel<5>;
+/** Interface for the kernel which applies a 7x7 convolution to a tensor.*/
+using NEConvolution7x7Kernel = NEConvolutionKernel<7>;
+/** Interface for the kernel which applies a 9x9 convolution to a tensor.*/
+using NEConvolution9x9Kernel = NEConvolutionKernel<9>;
+
+/****************************************************************************************\
+ *                              Separable Square Convolution                            *
+\****************************************************************************************/
+
+/** Kernel for the Horizontal pass of a Separable Convolution */
+template <unsigned int matrix_size>
+class NESeparableConvolutionHorKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NESeparableConvolutionHorKernel();
+
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U16, S16, S32.
+     * @param[in]  conv_row         Convolution matrix to apply to the input tensor.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Apply the object's convolution to the given window of the input tensor.
+     *
+     * @param[in] window Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolve(const Window &window);
+
+    std::array<int16_t, matrix_size> _conv_row; /**< Convolution coefficients */
+    BorderSize _border_size;                    /**< Border size */
+};
+
+/** Interface for the kernel which applies a 5x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution5x5HorKernel = NESeparableConvolutionHorKernel<5>;
+/** Interface for the kernel which applies a 7x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution7x7HorKernel = NESeparableConvolutionHorKernel<7>;
+/** Interface for the kernel which applies a 9x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution9x9HorKernel = NESeparableConvolutionHorKernel<9>;
+
+/** Kernel for the Vertical pass of a Separable Convolution */
+template <unsigned int matrix_size>
+class NESeparableConvolutionVertKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NESeparableConvolutionVertKernel();
+
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U16, S16, S32.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv_col         Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Apply the object's convolution to the given window of the input tensor.
+     *  This function is used if the intermediate values have been stored as U16.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_u16(const Window &win);
+    /** Apply the object's convolution to the given window of the input tensor.
+     *  This function is used if the intermediate values have been stored as S16.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_s16(const Window &win);
+    /** Apply the object's convolution to the given window of the input tensor.
+     *  This function is used if the intermediate values have been stored as S32.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_s32(const Window &win);
+
+    std::array<int16_t, matrix_size> _conv_col; /**< Convolution coefficients */
+    uint32_t _scale;                            /**< Convolution's scale */
+};
+
+/** Interface for the kernel which applies a 1x5 vertical convolution to a tensor.*/
+using NESeparableConvolution5x5VertKernel = NESeparableConvolutionVertKernel<5>;
+/** Interface for the kernel which applies a 1x7 vertical convolution to a tensor.*/
+using NESeparableConvolution7x7VertKernel = NESeparableConvolutionVertKernel<7>;
+/** Interface for the kernel which applies a 1x9 vertical convolution to a tensor.*/
+using NESeparableConvolution9x9VertKernel = NESeparableConvolutionVertKernel<9>;
+
+/****************************************************************************************\
+ *                                 Rectangle Convolution                                *
+\****************************************************************************************/
+
+/** Kernel for running a convolution on a rectangular matrix.
+ *
+ * @note Supports combinations of 3, 5, 7 and 9.
+ */
+class NEConvolutionRectangleKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEConvolutionRectangleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvolutionRectangleKernel(const NEConvolutionRectangleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvolutionRectangleKernel &operator=(const NEConvolutionRectangleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &&) = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  width            Width of convolution matrix (Number of columns)
+     * @param[in]  height           Height of convolution matrix (Number of rows)
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    unsigned int get_index(uint32_t val);
+    /** Apply the object's convolution to the given window of the input tensor.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType, unsigned int rows, unsigned int cols>
+    void convolution(const Window &win);
+
+protected:
+    const ITensor            *_input;       /**< Input tensor */
+    ITensor                  *_output;      /**< Output tensor */
+    uint32_t                  _scale;       /**< Scale of the convolution */
+    std::vector<int16_t>      _convolution; /**< Convolution matrix */
+    BorderSize                _border_size; /**< Calculated border width */
+    uint32_t                  _func_idx;    /**< Index used to specify convolution function to be used */
+    const static unsigned int _nr_supported_sizes
+    {
+        4
+    }; /**< Number of supported permutations */
+};
+}
+#endif /*__ARM_COMPUTE_NECONVOLUTIONKERNEL_H__ */
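
A scalar sketch of the documented scale rule and the correlation-style sum computed by the square convolution, for interior pixels only; saturation to the output type is omitted and the coefficient sum is assumed non-negative.

    #include <cstdint>
    #include <numeric>
    #include <vector>

    // 'in' points at the centre pixel of a U8 image with the given row stride;
    // conv holds the m x n coefficients in row-major order.
    int16_t convolve_pixel(const uint8_t *in, int stride, const std::vector<int16_t> &conv,
                           int m, int n, uint32_t scale)
    {
        if(scale == 0) // documented rule: sum of the coefficients, or 1 if they add up to 0
        {
            const int32_t sum = std::accumulate(conv.begin(), conv.end(), 0);
            scale             = (sum == 0) ? 1u : static_cast<uint32_t>(sum);
        }
        int32_t acc = 0;
        for(int k = 0; k < m; ++k)
        {
            for(int l = 0; l < n; ++l)
            {
                acc += in[(k - m / 2) * stride + (l - n / 2)] * conv[k * n + l];
            }
        }
        return static_cast<int16_t>(acc / static_cast<int32_t>(scale));
    }
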
diff --git a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
new file mode 100644
index 0000000..67b8c60
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__
+#define __ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class IDistribution1D;
+class ILut;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the cumulative distribution (cumulative summation) calculation kernel.
+ *
+ * This kernel calculates the cumulative sum of a given distribution (meaning that each output element
+ * is the sum of all its previous elements including itself) and creates a lookup table with the normalized
+ * pixel intensities, which is used to improve the contrast of the image.
+ */
+class NECumulativeDistributionKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NECumulativeDistributionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECumulativeDistributionKernel(const NECumulativeDistributionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECumulativeDistributionKernel &operator=(const NECumulativeDistributionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NECumulativeDistributionKernel(NECumulativeDistributionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NECumulativeDistributionKernel &operator=(NECumulativeDistributionKernel &&) = default;
+    /** Set the input and output distribution.
+     *
+     * @param[in]  input          Input image. Data type supported: U8
+     * @param[in]  distribution   Unnormalized 256-bin distribution of the input image.
+     * @param[out] cumulative_sum Cumulative distribution (summed histogram). Should be the same size as @p distribution.
+     * @param[out] output         Equalization lookup table. Should consist of 256 entries of U8 elements.
+     */
+    void configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    bool is_parallelisable() const override;
+
+private:
+    const IImage          *_input;          /**< Input image. */
+    const IDistribution1D *_distribution;   /**< Input histogram of the input image. */
+    IDistribution1D       *_cumulative_sum; /**< The cumulative distribution. */
+    ILut                  *_output;         /**< Output with the equalization lookup table. */
+private:
+    static const uint32_t _histogram_size = 256; /**< Default histogram size of 256. */
+};
+}
+
+#endif /*__ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__ */
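
The comment above describes the classic histogram-equalization construction; it is sketched below for a 256-bin U8 histogram. The rounding and the identity fallback for empty leading bins are assumptions, so the kernel's exact output may differ.

    #include <array>
    #include <cstdint>

    // Build the cumulative sum and the equalization LUT from a 256-bin histogram.
    void make_equalization_lut(const std::array<uint32_t, 256> &hist, uint32_t num_pixels,
                               std::array<uint32_t, 256> &cdf, std::array<uint8_t, 256> &lut)
    {
        uint32_t running = 0;
        for(size_t i = 0; i < hist.size(); ++i)
        {
            running += hist[i]; // each element is the sum of all previous elements including itself
            cdf[i] = running;
        }
        uint32_t cdf_min = 0; // first non-zero value of the cumulative distribution
        for(uint32_t c : cdf)
        {
            if(c != 0) { cdf_min = c; break; }
        }
        for(size_t i = 0; i < lut.size(); ++i)
        {
            const int64_t num = static_cast<int64_t>(cdf[i]) - cdf_min;
            const int64_t den = static_cast<int64_t>(num_pixels) - cdf_min;
            // Assumed behaviour: identity mapping where the formula is undefined.
            lut[i] = (num <= 0 || den <= 0) ? static_cast<uint8_t>(i) : static_cast<uint8_t>(num * 255 / den);
        }
    }
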
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
new file mode 100644
index 0000000..7384cd1
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the depth concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class NEDepthConcatenateKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDepthConcatenateKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConcatenateKernel(const NEDepthConcatenateKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConcatenateKernel &operator=(const NEDepthConcatenateKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDepthConcatenateKernel(NEDepthConcatenateKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDepthConcatenateKernel &operator=(NEDepthConcatenateKernel &&) = default;
+    /** Default destructor */
+    ~NEDepthConcatenateKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     input        Input tensor. Data types supported: F32.
+     * @param[in]     depth_offset The offset on the Z axis.
+     * @param[in,out] output       Output tensor. Data types supported: F32.
+     *
+     * @note The output tensor's two lowest dimensions can't be smaller than the input's.
+     * @note The gaps between the two lowest dimensions of input and output need to be divisible by 2.
+     *
+     */
+    void configure(const ITensor *input, unsigned int depth_offset, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;
+    ITensor       *_output;
+    int            _top_bottom;
+    int            _left_right;
+    unsigned int   _depth_offset;
+};
+}
+#endif /* __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__ */
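
A hypothetical helper showing the centring that the two notes above imply: the x/y gap between input and output splits evenly in two, hence the divisible-by-2 requirement.

    #include <utility>

    // Returns the (left/right, top/bottom) border in elements when an input
    // plane is centred inside a larger output plane.
    std::pair<unsigned int, unsigned int> concat_borders(unsigned int in_w, unsigned int in_h,
                                                         unsigned int out_w, unsigned int out_h)
    {
        const unsigned int left_right = (out_w - in_w) / 2;
        const unsigned int top_bottom = (out_h - in_h) / 2;
        return std::make_pair(left_right, top_bottom);
    }
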
diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
new file mode 100644
index 0000000..0c5c29e
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Depth conversion kernel */
+class NEDepthConvertKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor*/
+    NEDepthConvertKernel();
+    /** Set the input and output of the kernel
+     *
+     * Valid conversions Input -> Output :
+     *
+     *   - QS8 -> F32
+     *   - U8 -> U16, S16, S32
+     *   - U16 -> U8, U32
+     *   - S16 -> U8, S32
+     *   - F32 -> QS8
+     *
+     *
+     * @param[in]  input  The input tensor to convert. Data types supported: U8/QS8/U16/S16/F32.
+     * @param[out] output The output tensor. Data types supported: U8/QS8/U16/S16/U32/S32/F32.
+     * @param[in]  policy Conversion policy.
+     * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
+     */
+    void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    ConvertPolicy _policy;
+    uint32_t      _shift;
+};
+}
+#endif /*__ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__ */
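
Two scalar sketches of the documented shift semantics, one up-conversion and one saturating down-conversion; the kernel vectorises these and applies the requested conversion policy.

    #include <cstdint>

    // U8 -> S16 up-conversion: the shift scales values up (0 <= shift < 8).
    int16_t convert_u8_to_s16(uint8_t v, uint32_t shift)
    {
        return static_cast<int16_t>(static_cast<int16_t>(v) << shift);
    }

    // U16 -> U8 down-conversion with saturating (ConvertPolicy::SATURATE-like) clamping.
    uint8_t convert_u16_to_u8_saturate(uint16_t v, uint32_t shift)
    {
        const uint16_t shifted = static_cast<uint16_t>(v >> shift);
        return static_cast<uint8_t>(shifted > 255 ? 255 : shifted);
    }
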
diff --git a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
new file mode 100644
index 0000000..abb8a89
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__
+#define __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the derivative along the X/Y directions on a tensor.
+ *
+ */
+class NEDerivativeKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDerivativeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDerivativeKernel(const NEDerivativeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDerivativeKernel &operator=(const NEDerivativeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDerivativeKernel(NEDerivativeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDerivativeKernel &operator=(NEDerivativeKernel &&) = default;
+    /** Initialise the kernel's sources, destination and border
+     *
+     * @note At least one of output_x or output_y must be set
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Function to perform derivative along the X direction on the given window
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    void derivative_x(const Window &window);
+    /** Function to perform derivative along the Y direction on the given window
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    void derivative_y(const Window &window);
+    /** Function to perform derivative along the X and Y direction on the given window
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    void derivative_xy(const Window &window);
+    /** Common signature for all the specialised derivative functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using DerivativeFunction = void (NEDerivativeKernel::*)(const Window &window);
+    /** Derivative function to use for the particular tensor types passed to configure() */
+    DerivativeFunction _func;
+
+private:
+    const ITensor *_input;    /**< Input tensor */
+    ITensor       *_output_x; /**< Output tensor - Derivative along the X direction */
+    ITensor       *_output_y; /**< Output tensor - Derivative along the Y direction */
+};
+}
+#endif /* __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__ */
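
An illustrative scalar model of the X pass, assuming a centred [-1 0 1] difference; the actual filter taps live in the kernel's implementation and may differ.

    #include <cstdint>

    // 'pixel' points at the centre element of a U8 row; the output is S16.
    int16_t derivative_x_element(const uint8_t *pixel)
    {
        return static_cast<int16_t>(pixel[1]) - static_cast<int16_t>(pixel[-1]);
    }
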
diff --git a/arm_compute/core/NEON/kernels/NEDilateKernel.h b/arm_compute/core/NEON/kernels/NEDilateKernel.h
new file mode 100644
index 0000000..05f148a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDilateKernel.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDILATEKERNEL_H__
+#define __ARM_COMPUTE_NEDILATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform boolean image dilation */
+class NEDilateKernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: U8
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEDILATEKERNEL_H__ */
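
On U8 data, dilation reduces to a 3x3 maximum filter; a scalar sketch for an interior pixel, with border handling omitted:

    #include <algorithm>
    #include <cstdint>

    // 'in' points at the centre pixel of a U8 image with the given row stride.
    uint8_t dilate3x3(const uint8_t *in, int stride)
    {
        uint8_t m = 0;
        for(int y = -1; y <= 1; ++y)
        {
            for(int x = -1; x <= 1; ++x)
            {
                m = std::max(m, in[y * stride + x]);
            }
        }
        return m;
    }
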
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
new file mode 100644
index 0000000..f098e18
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+/** NEON kernel to accumulate the biases to each element of the input tensor
+ *
+ * @note We assume bias to be shared
+ */
+class NEDirectConvolutionLayerBiasAccumulateKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDirectConvolutionLayerBiasAccumulateKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerBiasAccumulateKernel(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerBiasAccumulateKernel &operator=(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerBiasAccumulateKernel(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerBiasAccumulateKernel &operator=(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default;
+    /** Default destructor */
+    ~NEDirectConvolutionLayerBiasAccumulateKernel() = default;
+    /** Set the accumulate buffer and the biases of the kernel.
+     *
+     * @param[in,out] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
+     *                        Data type supported: QS8/F32
+     * @param[in]      bias   The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
+     * @param[out]     output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
+     *                         Data type supported: Same as @p input
+     */
+    void configure(ITensor *input, const ITensor *bias, ITensor *output = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using BiasAccumulateKernel = void(ITensor *input, const ITensor *bias, const Window window, ITensor *output);
+
+private:
+    BiasAccumulateKernel *_func;
+    ITensor              *_input;
+    const ITensor        *_bias;
+    ITensor              *_output;
+};
+}
+#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__ */
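
Because the bias is shared per output feature map, the accumulation reduces to adding one scalar per plane; a scalar sketch for contiguous F32 data:

    #include <cstddef>

    // Adds bias[z] to every element of feature map z (out may alias in for
    // the in-place case documented above).
    void bias_accumulate(const float *in, const float *bias, float *out,
                         size_t width, size_t height, size_t ofm)
    {
        for(size_t z = 0; z < ofm; ++z)
        {
            for(size_t i = 0; i < width * height; ++i)
            {
                out[z * width * height + i] = in[z * width * height + i] + bias[z];
            }
        }
    }
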
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
new file mode 100644
index 0000000..d726071
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON interface for Direct Convolution Layer kernel */
+class NEDirectConvolutionLayerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDirectConvolutionLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerKernel(const NEDirectConvolutionLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerKernel &operator=(const NEDirectConvolutionLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerKernel(NEDirectConvolutionLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerKernel &operator=(NEDirectConvolutionLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEDirectConvolutionLayerKernel() = default;
+    /** Set the input, weights and output tensors.
+      *
+      * @param[in]  input     Input tensor. Data types supported: QS8/F32.
+      * @param[in]  weights   Set of kernels to convolve the input volume.
+      *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
+      *                       Data type supported: Same as @p input.
+      * @param[out] output    Output tensor.
+      *                       The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+      * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
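+      *
+      * A minimal usage sketch (tensor names are illustrative):
+      * @code
+      * NEDirectConvolutionLayerKernel conv;
+      * conv.configure(&src, &weights, &dst, PadStrideInfo(1, 1, 0, 0)); // stride (1,1), no padding
+      * conv.run(conv.window());
+      * @endcode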
+      */
+    void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;
+    const ITensor *_weights;
+    ITensor       *_output;
+    PadStrideInfo  _conv_info;
+    BorderSize     _border_size;
+    unsigned int   _kernel_size;
+    unsigned int   _num_elems_read_per_iteration;
+    unsigned int   _num_elems_written_per_iteration;
+};
+}
+#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEErodeKernel.h b/arm_compute/core/NEON/kernels/NEErodeKernel.h
new file mode 100644
index 0000000..86dc217
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEErodeKernel.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEERODEKERNEL_H__
+#define __ARM_COMPUTE_NEERODEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform boolean image erosion */
+class NEErodeKernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: U8
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
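+     *
+     * A minimal usage sketch (tensor names are illustrative):
+     * @code
+     * NEErodeKernel erode;
+     * erode.configure(&src, &dst, false);
+     * erode.run(erode.window()); // single-threaded execution over the configured window
+     * @endcode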
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEERODEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
new file mode 100644
index 0000000..d9bd6ac
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFASTCORNERSKERNEL_H__
+#define __ARM_COMPUTE_NEFASTCORNERSKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** NEON kernel to perform FAST corner detection */
+class NEFastCornersKernel : public INEKernel
+{
+public:
+    /** Constructor */
+    NEFastCornersKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFastCornersKernel(const NEFastCornersKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFastCornersKernel &operator=(const NEFastCornersKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFastCornersKernel(NEFastCornersKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFastCornersKernel &operator=(NEFastCornersKernel &&) = default;
+    /** Initialise the kernel.
+     *
+     * @param[in]  input               Source image. Data type supported: U8.
+     * @param[out] output              Output image. Data type supported: U8.
+     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+     * @param[in]  non_max_suppression True if non-maxima suppression is applied, false otherwise.
+     * @param[in]  border_undefined    True if the border mode is undefined. False if it's replicate or constant.
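+     *
+     * A minimal usage sketch (image names are illustrative):
+     * @code
+     * NEFastCornersKernel fast;
+     * fast.configure(&src, &dst, 20, true, false); // threshold = 20, with non-maxima suppression
+     * fast.run(fast.window());
+     * @endcode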
+     */
+    void configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const IImage *_input;               /**< source image */
+    IImage       *_output;              /**< intermediate results */
+    uint8_t       _threshold;           /**< threshold on difference between intensity of the central pixel and pixels on Bresenham's circle */
+    bool          _non_max_suppression; /**< True if non-maxima suppression is applied in the next stage */
+};
+}
+#endif /*__ARM_COMPUTE_NEFASTCORNERSKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
new file mode 100644
index 0000000..8e0846e
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLARRAYKERNEL_H__
+#define __ARM_COMPUTE_NEFILLARRAYKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** This kernel adds all texels greater than or equal to the threshold value to the keypoint array. */
+class NEFillArrayKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEFillArrayKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillArrayKernel(const NEFillArrayKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillArrayKernel &operator=(const NEFillArrayKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFillArrayKernel(NEFillArrayKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFillArrayKernel &operator=(NEFillArrayKernel &&) = default;
+    /** Default destructor */
+    ~NEFillArrayKernel() = default;
+
+    /** Initialise the kernel.
+     *
+     * @param[in]  input     Source image. Data type supported: U8.
+     * @param[in]  threshold Texels greater than the threshold will be added to the array.
+     * @param[out] output    Array of keypoints used to store the results.
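+     *
+     * A minimal usage sketch (the image name and array capacity are illustrative):
+     * @code
+     * KeyPointArray keypoints(10000); // room for up to 10000 keypoints
+     * NEFillArrayKernel fill;
+     * fill.configure(&src, 127, &keypoints);
+     * fill.run(fill.window());
+     * @endcode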
+     */
+    void configure(const IImage *input, uint8_t threshold, IKeyPointArray *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    bool is_parallelisable() const override;
+
+private:
+    const IImage   *_input;
+    IKeyPointArray *_output;
+    uint8_t         _threshold;
+};
+}
+#endif /*__ARM_COMPUTE_NEFILLARRAYKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
new file mode 100644
index 0000000..3ec6611
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLBORDERKERNEL_H__
+#define __ARM_COMPUTE_NEFILLBORDERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to fill borders */
+class NEFillBorderKernel : public INEKernel
+{
+public:
+    /** Default Constructor */
+    NEFillBorderKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillBorderKernel(const NEFillBorderKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillBorderKernel &operator=(const NEFillBorderKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFillBorderKernel(NEFillBorderKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFillBorderKernel &operator=(NEFillBorderKernel &&) = default;
+    /** Default destructor */
+    ~NEFillBorderKernel() = default;
+
+    /** Initialise the function.
+     *
+     * @note This kernel fills the borders within the XY-planes.
+     *
+     * @param[in,out] tensor                Tensor to process. Data types supported: U8/S8/QS8/QS16/S16/S32/F32.
+     * @param[in]     border_size           Size of the border to fill, in elements.
+     * @param[in]     border_mode           Border mode to use.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
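+     * A minimal usage sketch (the tensor name is illustrative):
+     * @code
+     * NEFillBorderKernel fill;
+     * fill.configure(&tensor, BorderSize(1), BorderMode::CONSTANT, PixelValue());
+     * fill.run(fill.window());
+     * @endcode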
+     */
+    void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    template <typename T>
+    void fill_replicate_single_channel(const Window &window);
+    template <typename T>
+    void fill_constant_value_single_channel(const Window &window);
+
+    ITensor   *_tensor;
+    BorderSize _border_size;
+    BorderMode _mode;
+    PixelValue _constant_border_value;
+};
+}
+#endif /*__ARM_COMPUTE_NEFILLBORDERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
new file mode 100644
index 0000000..61e6e46
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__
+#define __ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to fill the interior borders */
+class NEFillInnerBorderKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEFillInnerBorderKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillInnerBorderKernel(const NEFillInnerBorderKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillInnerBorderKernel &operator=(const NEFillInnerBorderKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFillInnerBorderKernel(NEFillInnerBorderKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFillInnerBorderKernel &operator=(NEFillInnerBorderKernel &&) = default;
+    /** Default destructor */
+    ~NEFillInnerBorderKernel() = default;
+
+    /** Initialise the function.
+     *
+     * @note This kernel fills the borders within the XY-planes.
+     *
+     * @param[in,out] input                 Tensor to process. Data types supported: U8/QS8/S16/S32/F32.
+     * @param[in]     border_size           Size of the border to fill, in elements.
+     * @param[in]     constant_border_value (Optional) Constant value to use for filling the border.
+     *
+     */
+    void configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value = PixelValue());
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    template <typename T>
+    void fill_value_single_channel(const Window &window);
+
+    ITensor   *_tensor;
+    BorderSize _border_size;
+    PixelValue _constant_border_value;
+};
+}
+#endif /*__ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
new file mode 100644
index 0000000..b9884ff
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__
+#define __ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to interleave the elements of a matrix
+ *
+ * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
+ */
+class NEGEMMInterleave4x4Kernel : public INESimpleKernel
+{
+public:
+    /** Constructor */
+    NEGEMMInterleave4x4Kernel();
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+     * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the transpose functions
+     *
+     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+     * @param[out] output The output tensor. Data type supported: same as @p input
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using GEMMInterleaveFunction = void(const ITensor *input, ITensor *output, const Window &window);
+
+    GEMMInterleaveFunction *_func; /**< GEMM interleave function to use for the particular tensor types passed to configure() */
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
new file mode 100644
index 0000000..ba4dcc3
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply matrices
+ *
+ * @note @ref NEGEMMLowpMatrixMultiplyKernel is the low precision matrix product kernel.
+ *  This kernel performs the following computation (see the scalar sketch after the list):
+ *
+ *  -# Convert the values of matrix A from uint8 to int32 and add a_offset to each of them.
+ *  -# Convert the values of matrix B from uint8 to int32 and add b_offset to each of them.
+ *  -# Compute the int32 matrix product of the resulting A * B.
+ *  -# Add output_offset to each entry of the result.
+ *  -# Multiply each entry of the result by output_mult_int and shift right by shift, rounding to the nearest integer.
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
+ *
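+ * A scalar sketch of the per-element computation described above (illustrative only;
+ * the vectorized implementation may differ in rounding details):
+ * @code
+ * int32_t acc = 0;
+ * for(size_t k = 0; k < K; ++k)
+ * {
+ *     acc += (int32_t(a[m][k]) + a_offset) * (int32_t(b[k][n]) + b_offset);
+ * }
+ * acc = ((acc + output_offset) * output_mult_int) >> shift; // rescale
+ * dst[m][n] = uint8_t(std::min(255, std::max(0, acc)));     // clamp to [0..255] and cast
+ * @endcode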
+ */
+class NEGEMMLowpMatrixMultiplyKernel : public INEKernel
+{
+public:
+    /** Constructor */
+    NEGEMMLowpMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpMatrixMultiplyKernel(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpMatrixMultiplyKernel &operator=(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpMatrixMultiplyKernel(NEGEMMLowpMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpMatrixMultiplyKernel &operator=(NEGEMMLowpMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
+     * kernels change the layout of the original matrices to be more cache-friendly.
+     *
+     * @param[in]  input0          Input tensor containing the interleaved Matrix A. Data type supported: U8
+     * @param[in]  input1          Input tensor containing the transposed Matrix B. Data type supported: same as @p input0
+     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in]  a_offset        Offset to be added to each element of the matrix A.
+     * @param[in]  b_offset        Offset to be added to each element of the matrix B.
+     * @param[in]  output_offset   Offset to be added to each element of the output matrix
+     * @param[in]  output_mult_int Value by which each entry of the result is multiplied.
+     * @param[in]  shift           Number of bits to shift right the result.
+     */
+    void configure(const ITensor *input0, const ITensor *input1, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input0;
+    const ITensor *_input1;
+    ITensor       *_output;
+    int32_t        _a_offset;
+    int32_t        _b_offset;
+    int32_t        _output_offset;
+    int32_t        _output_mult_int;
+    int32_t        _shift;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
new file mode 100644
index 0000000..c0ecafc
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+/** NEON kernel to add a bias to each row of the input tensor */
+class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEGEMMMatrixAccumulateBiasesKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMMatrixAccumulateBiasesKernel &operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMMatrixAccumulateBiasesKernel() = default;
+    /** Set the accumulate buffer and the biases of the kernel.
+     *
+     * @param[in, out] accum  The accumulate tensor to add the biases to. Data type supported: QS8/F32
+     * @param[in]      biases The shared biases tensor to append. It must be a 1D tensor. Data type supported: Same as @p accum
+     */
+    void configure(ITensor *accum, const ITensor *biases);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    ITensor       *_accum;
+    const ITensor *_biases;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
new file mode 100644
index 0000000..1ab52fa
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size
+ *
+ * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. When this kernel is used to finalize the GEMM result, we have:
+ *        - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref NEGEMMMatrixMultiplyKernel
+ *        - MTX_1 = C
+ */
+class NEGEMMMatrixAdditionKernel : public INESimpleKernel
+{
+public:
+    /** Constructor */
+    NEGEMMMatrixAdditionKernel();
+    /** Prevent instances of this class from being copied */
+    NEGEMMMatrixAdditionKernel(const NEGEMMMatrixAdditionKernel &) = delete;
+    /** Prevent instances of this class from being copied */
+    NEGEMMMatrixAdditionKernel &operator=(const NEGEMMMatrixAdditionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixAdditionKernel(NEGEMMMatrixAdditionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixAdditionKernel &operator=(NEGEMMMatrixAdditionKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @note The input and output tensors must have the same dimensions
+     *
+     * @param[in]      input  Input tensor (Matrix C). Data types supported: QS8/F16/F32
+     * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
+     * @param[in]      beta   Weight of matrix C
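+     *
+     * A minimal usage sketch (tensor names are illustrative), finalizing OUT = alpha * A * B + beta * C:
+     * @code
+     * NEGEMMMatrixAdditionKernel madd;
+     * madd.configure(&c, &mm_result, 0.5f); // mm_result already contains alpha * A * B
+     * madd.run(madd.window());
+     * @endcode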
+     */
+    void configure(const ITensor *input, ITensor *output, float beta);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the matrix addition functions
+     *
+     * @param[in]  input  An input tensor. Data types supported: QS8/F16/F32
+     * @param[out] output The output tensor. Data type supported: same as @p input
+     * @param[in]  window Region on which to execute the kernel.
+     * @param[in]  beta   Weight of matrix C
+     */
+    using MatrixAdditionFunction = void(const ITensor *input, ITensor *output, const Window &window, float beta);
+    /** Matrix addition function to use for the particular tensor types passed to configure() */
+    MatrixAdditionFunction *_func;
+    float                   _beta;
+};
+}
+#endif /* __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
new file mode 100644
index 0000000..a684945
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication
+ *
+ * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
+ * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that neither tensor has been reshaped
+ *
+ */
+class NEGEMMMatrixMultiplyKernel : public INEKernel
+{
+public:
+    /** Constructor */
+    NEGEMMMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMMatrixMultiplyKernel(const NEGEMMMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMMatrixMultiplyKernel &operator=(const NEGEMMMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixMultiplyKernel(NEGEMMMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixMultiplyKernel &operator=(NEGEMMMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
+     *       These two kernels change the layout of the original matrices to be more cache-friendly.
+     *
+     * @param[in]  input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
+     * @param[in]  input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
+     *                    If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
+     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
+     * @param[in]  alpha  Weight of the matrix product
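+     *
+     * A minimal configuration sketch for the matrix case (tensor names are illustrative), reshaping A and B first as described above:
+     * @code
+     * NEGEMMInterleave4x4Kernel  interleave;
+     * NEGEMMTranspose1xWKernel   transpose;
+     * NEGEMMMatrixMultiplyKernel mm;
+     * interleave.configure(&a, &a_interleaved);
+     * transpose.configure(&b, &b_transposed);
+     * mm.configure(&a_interleaved, &b_transposed, &dst, 1.0f); // alpha = 1
+     * @endcode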
+     */
+    void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input0;
+    const ITensor *_input1;
+    ITensor       *_output;
+    float          _alpha;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
new file mode 100644
index 0000000..5d8a369
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
+ *
+ * The following is an example of how the transposition 1xW works when the input data type is F32:
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * The following is an example of how the transposition 1xW works when the input data type is F16:
+ *
+ * @f[
+ * \left( \begin{array}{cccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\
+ * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\
+ * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\
+ * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\
+ * \end{array} \right)
+ * @f]
+ *
+ * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ *
+ */
+class NEGEMMTranspose1xWKernel : public INESimpleKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+     * @param[out] output Output tensor. Data type supported: same as @p input.
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
new file mode 100644
index 0000000..763fab8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__
+#define __ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Gaussian 3x3 filter */
+class NEGaussian3x3Kernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: S16
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
new file mode 100644
index 0000000..86b2890
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__
+#define __ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Gaussian 5x5 filter (horizontal pass) */
+class NEGaussian5x5HorKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEGaussian5x5HorKernel();
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
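+     *
+     * A minimal two-pass usage sketch (tensor names are illustrative); the intermediate tensor holds the S16 horizontal result:
+     * @code
+     * NEGaussian5x5HorKernel  hor;
+     * NEGaussian5x5VertKernel vert;
+     * hor.configure(&src, &tmp_s16, false);
+     * vert.configure(&tmp_s16, &dst, false);
+     * hor.run(hor.window());
+     * vert.run(vert.window());
+     * @endcode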
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    BorderSize _border_size;
+};
+
+/** NEON kernel to perform a Gaussian 5x5 filter (vertical pass) */
+class NEGaussian5x5VertKernel : public INESimpleKernel
+{
+public:
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: S16.
+     * @param[out] output           Destination tensor. Data type supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
new file mode 100644
index 0000000..40a6aa7
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__
+#define __ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a GaussianPyramid (horizontal pass) */
+class NEGaussianPyramidHorKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEGaussianPyramidHorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidHorKernel(const NEGaussianPyramidHorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidHorKernel &operator=(const NEGaussianPyramidHorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &&) = default;
+    /** Default destructor */
+    ~NEGaussianPyramidHorKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    BorderSize _border_size;
+    int        _l2_load_offset;
+};
+
+/** NEON kernel to perform a GaussianPyramid (vertical pass) */
+class NEGaussianPyramidVertKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEGaussianPyramidVertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidVertKernel(const NEGaussianPyramidVertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidVertKernel &operator=(const NEGaussianPyramidVertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &&) = default;
+    /** Default destructor */
+    ~NEGaussianPyramidVertKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: S16.
+     * @param[out] output           Destination tensor. Data type supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    int _t2_load_offset;
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
new file mode 100644
index 0000000..dd85778
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__
+#define __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__
+
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Size2D.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform HOG Orientation Binning */
+class NEHOGOrientationBinningKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEHOGOrientationBinningKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGOrientationBinningKernel(const NEHOGOrientationBinningKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGOrientationBinningKernel &operator=(const NEHOGOrientationBinningKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEHOGOrientationBinningKernel(NEHOGOrientationBinningKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEHOGOrientationBinningKernel &operator=(NEHOGOrientationBinningKernel &&) = default;
+    /** Default destructor */
+    ~NEHOGOrientationBinningKernel() = default;
+
+    /** Initialise the kernel's inputs, output and HOG's metadata
+     *
+     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
+     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8.
+     * @param[out] output          Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[in]  hog_info        HOG's metadata
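+     *
+     * A minimal usage sketch (names are placeholders; assumes the tensors and the @ref HOGInfo object are already set up):
+     * @code
+     * NEHOGOrientationBinningKernel orient_bin;
+     * orient_bin.configure(&mag, &phase, &hog_space, &hog_info);
+     * orient_bin.run(orient_bin.window()); // single-threaded run over the full window
+     * @endcode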
+     */
+    void configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised block normalization functions
+     *
+     * @param[in]  mag_row_ptr   Pointer to the first row of the cell in the magnitude tensor
+     * @param[in]  phase_row_ptr Pointer to the first row of the cell in the phase tensor
+     * @param[out] output_ptr    Pointer to the output cell of hog space tensor
+     * @param[in]  mag_stride    Stride of the magnitude tensor
+     * @param[in]  phase_stride  Stride of the phase tensor
+     * @param[in]  cell_width    Width of the cell
+     * @param[in]  cell_height   Height of the cell
+     * @param[in]  num_bins      Number of bins for each cell
+     * @param[in]  phase_scale   Scale factor to apply to the phase in order to calculate the histogram index
+     */
+    using OrientBinFunc = void(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
+                               size_t cell_height, size_t num_bins, float phase_scale);
+    /** Orientation binning function to use for the particular cell width passed to configure() */
+    OrientBinFunc *_func;
+    const ITensor *_input_magnitude;
+    const ITensor *_input_phase;
+    ITensor       *_output;
+    size_t         _cell_width;
+    size_t         _cell_height;
+    size_t         _num_bins;
+    float          _phase_scale;
+};
+
+/** NEON kernel to perform HOG block normalization */
+class NEHOGBlockNormalizationKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEHOGBlockNormalizationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGBlockNormalizationKernel(const NEHOGBlockNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGBlockNormalizationKernel &operator=(const NEHOGBlockNormalizationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEHOGBlockNormalizationKernel(NEHOGBlockNormalizationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEHOGBlockNormalizationKernel &operator=(NEHOGBlockNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~NEHOGBlockNormalizationKernel() = default;
+
+    /** Initialise the kernel's input, output and HOG's metadata
+     *
+     * @param[in]  input    Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[out] output   Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog_info HOG's metadata
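+     *
+     * A minimal usage sketch (names are placeholders), typically chained after @ref NEHOGOrientationBinningKernel:
+     * @code
+     * NEHOGBlockNormalizationKernel block_norm;
+     * block_norm.configure(&hog_space, &normalized_blocks, &hog_info);
+     * block_norm.run(block_norm.window());
+     * @endcode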
+     */
+    void configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised block normalization functions
+     *
+     * @param[in]  input_row_ptr              Pointer to the first row of the block in the input hog space tensor
+     * @param[out] output_ptr                 Pointer to the output block of the hog normalized space
+     * @param[in]  input_stride               Stride of the input hog space tensor
+     * @param[in]  num_cells_per_block_height Number of cells per block along the Y direction
+     * @param[in]  num_bins_block_x           Number of bins per block along the X direction
+     * @param[in]  num_bins_block             Number of total bins per block
+     * @param[in]  l2_hyst_threshold          Threshold to use for l2 hysteresis normalization
+     */
+    using BlockNormFunc = void(const float *input_row_ptr, float *output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
+                               float l2_hyst_threshold);
+    /** Block normalization function to use for the particular normalization type passed to configure() */
+    BlockNormFunc *_func;
+    const ITensor *_input;
+    ITensor       *_output;
+    Size2D         _num_cells_per_block;
+    Size2D         _num_cells_per_block_stride;
+    size_t         _num_bins;
+    float          _l2_hyst_threshold;
+};
+}
+#endif /* __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
new file mode 100644
index 0000000..e56d1e5
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__
+#define __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform HOG detector kernel using linear SVM */
+class NEHOGDetectorKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEHOGDetectorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGDetectorKernel(const NEHOGDetectorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGDetectorKernel &operator=(const NEHOGDetectorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEHOGDetectorKernel(NEHOGDetectorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEHOGDetectorKernel &operator=(NEHOGDetectorKernel &&) = default;
+    /** Default destructor */
+    ~NEHOGDetectorKernel() = default;
+
+    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
+     *
+     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref NEHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog                     HOG data object used by @ref NEHOGOrientationBinningKernel and @ref NEHOGBlockNormalizationKernel
+     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
+     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be a multiple of hog->info()->block_stride()
+     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
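+     *
+     * A minimal usage sketch (names and the stride value are placeholders; the stride must satisfy the constraint above):
+     * @code
+     * NEHOGDetectorKernel detector;
+     * detector.configure(&descriptor, &hog, &detection_windows, Size2D(8, 8)); // default threshold and class index
+     * detector.run(detector.window());
+     * @endcode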
+     */
+    void configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, uint16_t idx_class = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor         *_input;
+    IDetectionWindowArray *_detection_windows;
+    const float           *_hog_descriptor;
+    float                  _bias;
+    float                  _threshold;
+    uint16_t               _idx_class;
+    size_t                 _num_bins_per_descriptor_x;
+    size_t                 _num_blocks_per_descriptor_y;
+    size_t                 _block_stride_width;
+    size_t                 _block_stride_height;
+    size_t                 _detection_window_width;
+    size_t                 _detection_window_height;
+    size_t                 _max_num_detection_windows;
+    std::mutex             _mutex;
+};
+}
+
+#endif /* __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
new file mode 100644
index 0000000..0abd73e
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__
+#define __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__
+
+#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Common interface for all Harris Score kernels */
+class INEHarrisScoreKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    INEHarrisScoreKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEHarrisScoreKernel(const INEHarrisScoreKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEHarrisScoreKernel &operator=(const INEHarrisScoreKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    INEHarrisScoreKernel(INEHarrisScoreKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    INEHarrisScoreKernel &operator=(INEHarrisScoreKernel &&) = default;
+    /** Default destructor */
+    ~INEHarrisScoreKernel() = default;
+
+public:
+    /** Setup the kernel parameters
+     *
+     * @param[in]  input1           Source image (gradient X). Data types supported: S16/S32
+     * @param[in]  input2           Source image (gradient Y). Data types supported: same as @p input1
+     * @param[out] output           Destination image (harris score). Data types supported: F32
+     * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
+     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
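+     *
+     * A minimal usage sketch through the block_size 3 specialisation (names and the float values are placeholders; gx/gy are the input gradients):
+     * @code
+     * NEHarrisScoreKernel<3> harris;
+     * harris.configure(&gx, &gy, &score, 1.f, 10000.f, 0.04f, false);
+     * harris.run(harris.window());
+     * @endcode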
+     */
+    virtual void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) = 0;
+
+protected:
+    const IImage *_input1;          /**< Source image - Gx component */
+    const IImage *_input2;          /**< Source image - Gy component */
+    IImage       *_output;          /**< Source image - Harris score */
+    float         _sensitivity;     /**< Sensitivity value */
+    float         _strength_thresh; /**< Threshold value */
+    float         _norm_factor;     /**< Normalization factor */
+    BorderSize    _border_size;     /**< Border size */
+};
+
+/** Template NEON kernel to perform Harris Score.
+ *  The implementation supports 3, 5, and 7 for the block_size
+ */
+template <int32_t block_size>
+class NEHarrisScoreKernel : public INEHarrisScoreKernel
+{
+public:
+    /** Default constructor */
+    NEHarrisScoreKernel();
+    // Inherited methods overridden:
+    void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override;
+    BorderSize border_size() const override;
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised harris score functions */
+    using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+                                     float norm_factor, float sensitivity, float strength_thresh);
+    /** Harris Score function to use for the particular image types passed to configure() */
+    HarrisScoreFunction *_func;
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** Template NEON kernel to perform Harris Score using F16.
+ *  The implementation supports 3, 5, and 7 for the block_size
+ */
+template <int32_t block_size>
+class NEHarrisScoreFP16Kernel : public INEHarrisScoreKernel
+{
+public:
+    /** Default constructor */
+    NEHarrisScoreFP16Kernel();
+    // Inherited methods overridden:
+    void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override;
+    BorderSize border_size() const override;
+    void run(const Window &window) override;
+
+private:
+    using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+                                     float norm_factor, float sensitivity, float strength_thresh);
+    /** Harris Score function to use for the particular image types passed to configure() */
+    HarrisScoreFunction *_func;
+};
+#else
+template <int32_t block_size>
+using NEHarrisScoreFP16Kernel = NEHarrisScoreKernel<block_size>;
+#endif
+}
+#endif /* __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEHistogramKernel.h b/arm_compute/core/NEON/kernels/NEHistogramKernel.h
new file mode 100644
index 0000000..c4dbbea
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEHistogramKernel.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHISTOGRAMKERNEL_H__
+#define __ARM_COMPUTE_NEHISTOGRAMKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class IDistribution1D;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the histogram kernel */
+class NEHistogramKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEHistogramKernel();
+    /** Default destructor */
+    ~NEHistogramKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHistogramKernel(const NEHistogramKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHistogramKernel &operator=(const NEHistogramKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEHistogramKernel(NEHistogramKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEHistogramKernel &operator=(NEHistogramKernel &&) = default;
+
+    /** Set the input image and the distribution output.
+     *
+     * @param[in]     input      Source image. Data type supported: U8.
+     * @param[out]    output     Destination distribution.
+     * @param[in,out] local_hist Array that the threads use to save their local histograms.
+     *                           Its size should be equal to (number_of_threads * num_bins),
+     *                           and Window::thread_id() is used to determine the part of the array
+     *                           used by each thread.
+     * @param[out]    window_lut LUT with pre-calculated possible window values.
+     *                           The size of the LUT should be equal to max_range_size. It is filled
+     *                           during the configure stage and re-used in every run, so it can be
+     *                           safely shared among threads.
+     */
+    void configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut);
+    /** Set the input image and the distribution output.
+     *
+     * @note Used for histogram of fixed size equal to 256
+     *
+     * @param[in]  input  Source image. Data type supported: U8.
+     * @param[out] output Destination distribution which must have 256 bins.
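+     *
+     * A minimal usage sketch (names are placeholders; assumes a 256-bin @ref IDistribution1D is already created):
+     * @code
+     * NEHistogramKernel hist;
+     * hist.configure(&image_u8, &distribution);
+     * hist.run(hist.window());
+     * @endcode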
+     */
+    void configure(const IImage *input, IDistribution1D *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Function to merge multiple partial histograms.
+     *
+     *  @param[out] global_hist Pointer to the final histogram.
+     *  @param[in]  local_hist  Pointer to the partial histograms.
+     *  @param[in]  bins        Number of bins.
+     */
+    void merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins);
+    /** Function to merge multiple minimum values of partial histograms.
+     *
+     *  @param[out] global_min Pointer to the global min value.
+     *  @param[in]  local_min  Local min value.
+     */
+    void merge_min(uint8_t *global_min, const uint8_t &local_min);
+    /** Function to perform histogram on the given window
+     *
+     *  @param[in] win Region on which to execute the kernel
+     */
+    void histogram_U8(Window win);
+    /** Function to perform histogram on the given window where the histogram
+     *  has a fixed size of 256 bins and no ranges or offsets.
+     *
+     *  @param[in] win Region on which to execute the kernel
+     */
+    void histogram_fixed_U8(Window win);
+    /** Pre-calculate the pixel windowing for every possible pixel
+     *
+     * Calculate (V - offset) * numBins / range where V is every possible pixel value.
+     *
+     * @note We currently support U8 images, thus possible pixel values are between 0 and 255
+     */
+    void calculate_window_lut() const;
+    /** Common signature for all the specialised Histogram functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window);
+
+    HistogramFunctionPtr          _func; ///< Histogram function to use for the particular image types passed to configure()
+    const IImage                 *_input;
+    IDistribution1D              *_output;
+    uint32_t                     *_local_hist;
+    uint32_t                     *_window_lut;
+    std::mutex                    _hist_mtx;
+    static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
+};
+}
+#endif /*__ARM_COMPUTE_NEHISTOGRAMKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
new file mode 100644
index 0000000..ebaafb4
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEIM2COLKERNEL_H__
+#define __ARM_COMPUTE_NEIM2COLKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns: each convolution block is extracted into a single
+ * column, so that a convolution can be computed as a plain matrix multiplication.
+ *
+ * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NEIm2ColKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEIm2ColKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEIm2ColKernel(const NEIm2ColKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEIm2ColKernel &operator=(const NEIm2ColKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEIm2ColKernel(NEIm2ColKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEIm2ColKernel &operator=(NEIm2ColKernel &&) = default;
+    /** Default destructor */
+    ~NEIm2ColKernel() = default;
+
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input          The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                            while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/F32
+     * @param[out] output         The output tensor. Data types supported: Same as @p input
+     * @param[in]  convolved_dims The convolved output dimensions.
+     * @param[in]  conv_info      Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  has_bias       In case biases are provided, expands each output column with an additional element set to 1.
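+     *
+     * A minimal usage sketch (names are placeholders; out_w/out_h are the convolved output width and height):
+     * @code
+     * NEIm2ColKernel im2col;
+     * im2col.configure(&src, &col, std::make_pair(out_w, out_h), conv_info, false); // no bias
+     * im2col.run(im2col.window());
+     * @endcode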
+     */
+    void configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Template function to run the im2col optimised for the fully connected layer case
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run_reduced(const Window &window);
+    /** Template function to run the im2col used for the convolution layer case
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T, bool has_pads>
+    void run_generic(const Window &window);
+    /** Common signature for all the specialised im2col functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using Im2ColFunctionPtr = void (NEIm2ColKernel::*)(const Window &window);
+
+    Im2ColFunctionPtr _func;
+    const ITensor    *_input;
+    ITensor          *_output;
+    std::pair<unsigned int, unsigned int> _convolved_dims;
+    PadStrideInfo _conv_info;
+    unsigned int  _kernel_size;
+    bool          _has_bias;
+};
+}
+#endif /*__ARM_COMPUTE_NEIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
new file mode 100644
index 0000000..1364788
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__
+#define __ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to perform an image integral on an image */
+class NEIntegralImageKernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input  Source tensor. Data type supported: U8
+     * @param[out] output Destination tensor. Data type supported: U32
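+     *
+     * A minimal usage sketch (names are placeholders):
+     * @code
+     * NEIntegralImageKernel integral;
+     * integral.configure(&src_u8, &dst_u32);
+     * integral.run(integral.window());
+     * @endcode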
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+    bool       is_parallelisable() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
new file mode 100644
index 0000000..9ab7f91
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LKTRACKERKERNEL_H__
+#define __ARM_COMPUTE_LKTRACKERKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+#include <utility>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Internal keypoint class for Lucas-Kanade Optical Flow */
+struct NELKInternalKeypoint
+{
+    float x{ 0.f };                 /**< x coordinate of the keypoint */
+    float y{ 0.f };                 /**< y coordinate of the keypoint */
+    bool  tracking_status{ false }; /**< the tracking status of the keypoint */
+};
+
+using INELKInternalKeypointArray = IArray<NELKInternalKeypoint>;
+
+/** Interface for the Lucas-Kanade tracker kernel */
+class NELKTrackerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NELKTrackerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELKTrackerKernel(const NELKTrackerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELKTrackerKernel &operator=(const NELKTrackerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NELKTrackerKernel(NELKTrackerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NELKTrackerKernel &operator=(NELKTrackerKernel &&) = default;
+    /** Default destructor */
+    ~NELKTrackerKernel() = default;
+
+    /** Initialise the kernel input and output
+     *
+     * @param[in]      input_old            Pointer to the input old tensor. Data type supported: U8
+     * @param[in]      input_new            Pointer to the input new tensor. Data type supported: U8
+     * @param[in]      old_scharr_gx        Pointer to the input scharr X tensor. Data type supported: S16
+     * @param[in]      old_scharr_gy        Pointer to the input scharr Y tensor. Data type supported: S16
+     * @param[in]      old_points           Pointer to the IKeyPointArray storing old key points
+     * @param[in]      new_points_estimates Pointer to the IKeyPointArray storing new estimates key points
+     * @param[out]     new_points           Pointer to the IKeyPointArray storing new key points
+     * @param[in, out] old_points_internal  Pointer to the array of NELKInternalKeypoint for old points
+     * @param[out]     new_points_internal  Pointer to the array of NELKInternalKeypoint for new points
+     * @param[in]      termination          The criteria to terminate the search of each keypoint.
+     * @param[in]      use_initial_estimate The flag to indicate whether the initial estimated position should be used
+     * @param[in]      epsilon              The error for terminating the algorithm
+     * @param[in]      num_iterations       The maximum number of iterations before terminating the algorithm
+     * @param[in]      window_dimension     The size of the window on which to perform the algorithm
+     * @param[in]      level                The pyramid level
+     * @param[in]      num_levels           The number of pyramid levels
+     * @param[in]      pyramid_scale        Scale factor used for generating the pyramid
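+     *
+     * A minimal usage sketch for a single pyramid level (names and values are placeholders; the arrays follow the types listed above):
+     * @code
+     * NELKTrackerKernel lk;
+     * lk.configure(&old_img, &new_img, &scharr_gx, &scharr_gy, &old_points, &estimates, &new_points,
+     *              &old_internal, &new_internal, Termination::TERM_CRITERIA_BOTH, true,
+     *              0.01f, 20, 5, 0, 1, 0.5f); // epsilon, iterations, window size, level 0 of 1, scale
+     * lk.run(lk.window());
+     * @endcode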
+     */
+    void configure(const ITensor *input_old, const ITensor *input_new, const ITensor *old_scharr_gx, const ITensor *old_scharr_gy,
+                   const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, IKeyPointArray *new_points,
+                   INELKInternalKeypointArray *old_points_internal, INELKInternalKeypointArray *new_points_internal,
+                   Termination termination, bool use_initial_estimate, float epsilon, unsigned int num_iterations, size_t window_dimension,
+                   size_t level, size_t num_levels, float pyramid_scale);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Initialise the array of keypoints in the provided range
+     *
+     * @param[in] start Index of first element in the keypoints array to be initialised
+     * @param[in] end   Index after last element in the keypoints array to be initialised
+     */
+    void init_keypoints(int start, int end);
+    /** Compute the structure tensor A^T * A based on the scharr gradients I_x and I_y
+     *
+     * @param[in]  keypoint    Keypoint for which gradients are computed
+     * @param[out] bilinear_ix Intermediate interpolated data for X gradient
+     * @param[out] bilinear_iy Intermediate interpolated data for Y gradient
+     *
+     * @return Values A11, A12, A22
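+     *
+     * With the gradients @f$ I_x @f$ and @f$ I_y @f$ sampled over the tracking window W,
+     * A11, A12 and A22 are the distinct entries of the symmetric matrix
+     *
+     * @f[
+     * A^T A = \sum_{p \in W}
+     * \left( \begin{array}{cc}
+     * I_x(p)^2 & I_x(p) I_y(p) \\
+     * I_x(p) I_y(p) & I_y(p)^2 \\
+     * \end{array} \right)
+     * @f]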
+     */
+    std::tuple<int, int, int> compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int *bilinear_ix, int *bilinear_iy);
+    /** Compute the vector A^T * b, i.e. -sum(I_d * I_t) for d in {x,y}
+     *
+     * @param[in] old_keypoint Old keypoint for which gradient is computed
+     * @param[in] new_keypoint New keypoint for which gradient is computed
+     * @param[in] bilinear_ix  Intermediate interpolated data for X gradient
+     * @param[in] bilinear_iy  Intermediate interpolated data for Y gradient
+     *
+     * @return Values b1, b2
+     */
+    std::pair<int, int> compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int *bilinear_ix, const int *bilinear_iy);
+
+    const ITensor              *_input_old;
+    const ITensor              *_input_new;
+    const ITensor              *_old_scharr_gx;
+    const ITensor              *_old_scharr_gy;
+    IKeyPointArray             *_new_points;
+    const IKeyPointArray       *_new_points_estimates;
+    const IKeyPointArray       *_old_points;
+    INELKInternalKeypointArray *_old_points_internal;
+    INELKInternalKeypointArray *_new_points_internal;
+    Termination                 _termination;
+    bool                        _use_initial_estimate;
+    float                       _pyramid_scale;
+    float                       _epsilon;
+    unsigned int                _num_iterations;
+    int                         _window_dimension;
+    unsigned int                _level;
+    unsigned int                _num_levels;
+    ValidRegion                 _valid_region;
+};
+}
+#endif /*__ARM_COMPUTE_NELKTRACKERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
new file mode 100644
index 0000000..d4bff66
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply each row of the first tensor with the lowest 2 dimensions of the second tensor. */
+class NELocallyConnectedMatrixMultiplyKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NELocallyConnectedMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELocallyConnectedMatrixMultiplyKernel(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELocallyConnectedMatrixMultiplyKernel &operator=(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NELocallyConnectedMatrixMultiplyKernel(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NELocallyConnectedMatrixMultiplyKernel &operator=(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's input and output
+     *
+     * @param[in]  input0 First input tensor. Data types supported: F32
+     * @param[in]  input1 Second input tensor containing the Matrix B. Data type supported: same as @p input0
+     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
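+     *
+     * A minimal usage sketch (names are placeholders; all tensors are F32):
+     * @code
+     * NELocallyConnectedMatrixMultiplyKernel lc_mm;
+     * lc_mm.configure(&input_rows, &weights, &output);
+     * lc_mm.run(lc_mm.window());
+     * @endcode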
+     */
+    void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input0;
+    const ITensor *_input1;
+    ITensor       *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
new file mode 100644
index 0000000..5d49901
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__
+#define __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Template interface for the kernel to compute magnitude and phase */
+template <MagnitudeType mag_type, PhaseType phase_type>
+class NEMagnitudePhaseKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEMagnitudePhaseKernel();
+    /** Destructor */
+    ~NEMagnitudePhaseKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMagnitudePhaseKernel(const NEMagnitudePhaseKernel &) = delete;
+    /** Default move constructor */
+    NEMagnitudePhaseKernel(NEMagnitudePhaseKernel &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMagnitudePhaseKernel &operator=(const NEMagnitudePhaseKernel &) = delete;
+    /** Default move assignment operator */
+    NEMagnitudePhaseKernel &operator=(NEMagnitudePhaseKernel &&) = default;
+
+    /** Initialise the kernel's input, output.
+     *
+     * @note At least one of @p magnitude or @p phase must be set
+     *
+     * @param[in]  gx        Gradient X tensor. Data type supported: S16.
+     * @param[in]  gy        Gradient Y tensor. Data type supported: S16.
+     * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
+     * @param[out] phase     (Optional) The output tensor - Phase. Data type supported: U8.
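+     *
+     * A minimal usage sketch requesting both outputs (names are placeholders):
+     * @code
+     * NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED> mag_phase;
+     * mag_phase.configure(&gx, &gy, &mag, &phase);
+     * mag_phase.run(mag_phase.window());
+     * @endcode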
+     */
+    void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Function to perform magnitude on the given window
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    void magnitude(const Window &window);
+    /** Function to perform phase on the given window
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    void phase(const Window &window);
+    /** Function to perform magnitude and phase on the given window
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    void magnitude_phase(const Window &window);
+
+private:
+    /** Common signature for all the specialised MagnitudePhase functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseKernel::*)(const Window &window);
+    /** MagnitudePhase function to use for the particular formats passed to configure() */
+    MagnitudePhaseFunctionPtr _func;
+    const ITensor            *_gx;        /**< Input gradient X */
+    const ITensor            *_gy;        /**< Input gradient Y */
+    ITensor                  *_magnitude; /**< Output - Magnitude */
+    ITensor                  *_phase;     /**< Output - Phase */
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** Template interface for the kernel to compute magnitude and phase */
+template <MagnitudeType mag_type, PhaseType phase_type>
+class NEMagnitudePhaseFP16Kernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEMagnitudePhaseFP16Kernel();
+    /** Destructor */
+    ~NEMagnitudePhaseFP16Kernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMagnitudePhaseFP16Kernel(const NEMagnitudePhaseFP16Kernel &) = delete;
+    /** Default move constructor */
+    NEMagnitudePhaseFP16Kernel(NEMagnitudePhaseFP16Kernel &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMagnitudePhaseFP16Kernel &operator=(const NEMagnitudePhaseFP16Kernel &) = delete;
+    /** Default move assignment operator */
+    NEMagnitudePhaseFP16Kernel &operator=(NEMagnitudePhaseFP16Kernel &&) = default;
+
+    /** Initialise the kernel's input, output.
+     *
+     * @note At least one of @p magnitude or @p phase must be set
+     *
+     * @param[in]  gx        Gradient X tensor. Data type supported: S16.
+     * @param[in]  gy        Gradient Y tensor. Data type supported: S16.
+     * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
+     * @param[out] phase     (Optional) The output tensor - Phase. Data type supported: U8.
+     */
+    void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Function to perform magnitude on the given window
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    void magnitude(const Window &window);
+    /** Function to perform phase on the given window
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    void phase(const Window &window);
+    /** Function to perform magnitude and phase on the given window
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    void magnitude_phase(const Window &window);
+
+    /** Common signature for all the specialised MagnitudePhase functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseFP16Kernel::*)(const Window &window);
+    /** MagnitudePhase function to use for the particular formats passed to configure() */
+    MagnitudePhaseFunctionPtr _func;
+    const ITensor            *_gx;        /**< Input gradient X */
+    const ITensor            *_gy;        /**< Input gradient Y */
+    ITensor                  *_magnitude; /**< Output - Magnitude */
+    ITensor                  *_phase;     /**< Output - Phase */
+};
+#else
+template <MagnitudeType mag_type, PhaseType phase_type>
+using NEMagnitudePhaseFP16Kernel = NEMagnitudePhaseKernel<mag_type, phase_type>;
+#endif
+}
+#endif /* __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
new file mode 100644
index 0000000..83407cc
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__
+#define __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
+class NEMeanStdDevKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEMeanStdDevKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMeanStdDevKernel(const NEMeanStdDevKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMeanStdDevKernel &operator=(const NEMeanStdDevKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMeanStdDevKernel(NEMeanStdDevKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMeanStdDevKernel &operator=(NEMeanStdDevKernel &&) = default;
+    /** Default destructor */
+    ~NEMeanStdDevKernel() = default;
+
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input              Input image. Data type supported: U8.
+     * @param[out] mean               Output average pixel value.
+     * @param[out] global_sum         Keeps global sum of pixel values.
+     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
+     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
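+     *
+     * A minimal usage sketch (names are placeholders; the accumulators are zero-initialised before the run):
+     * @code
+     * float    mean = 0.f, stddev = 0.f;
+     * uint64_t sum = 0, sum_sq = 0;
+     * NEMeanStdDevKernel mean_stddev;
+     * mean_stddev.configure(&image_u8, &mean, &sum, &stddev, &sum_sq);
+     * mean_stddev.run(mean_stddev.window());
+     * @endcode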
+     */
+    void configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev = nullptr, uint64_t *global_sum_squared = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const IImage *_input;
+    float        *_mean;
+    float        *_stddev;
+    uint64_t     *_global_sum;
+    uint64_t     *_global_sum_squared;
+    std::mutex    _mtx;
+};
+}
+#endif /* __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
new file mode 100644
index 0000000..dee1aad
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__
+#define __ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to perform a median filter on a tensor */
+class NEMedian3x3Kernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: U8
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
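+     *
+     * A minimal usage sketch (names are placeholders):
+     * @code
+     * NEMedian3x3Kernel median;
+     * median.configure(&src_u8, &dst_u8, false); // border mode is replicate or constant
+     * median.run(median.window());
+     * @endcode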
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h
new file mode 100644
index 0000000..e405ea5
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__
+#define __ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the kernel to perform min max search on an image. */
+class NEMinMaxKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEMinMaxKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxKernel(const NEMinMaxKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxKernel &operator=(const NEMinMaxKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMinMaxKernel(NEMinMaxKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMinMaxKernel &operator=(NEMinMaxKernel &&) = default;
+    /** Default destructor */
+    ~NEMinMaxKernel() = default;
+
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input Input image. Data types supported: U8/S16.
+     * @param[out] min   Minimum value of image.
+     * @param[out] max   Maximum value of image.
+     */
+    void configure(const IImage *input, int32_t *min, int32_t *max);
+    /** Resets global minimum and maximum. */
+    void reset();
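+
+    // Illustrative sketch (names assumed): `image` is an allocated U8 image.
+    // reset() restores the initial min/max before a new run, since results
+    // are accumulated across calls.
+    //
+    //   int32_t min_val = 0, max_val = 0;
+    //   NEMinMaxKernel minmax;
+    //   minmax.configure(&image, &min_val, &max_val);
+    //   minmax.reset();
+    //   minmax.run(minmax.window());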
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Performs the min/max algorithm on U8 images on a given window.
+     *
+     * @param win The window to run the algorithm on.
+     */
+    void minmax_U8(const Window &win);
+    /** Performs the min/max algorithm on S16 images on a given window.
+     *
+     * @param win The window to run the algorithm on.
+     */
+    void minmax_S16(const Window &win);
+    /** Common signature for all the specialised MinMax functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using MinMaxFunction = void (NEMinMaxKernel::*)(const Window &window);
+    /** MinMax function to use for the particular image types passed to configure() */
+    MinMaxFunction _func;
+    /** Helper to update min/max values */
+    template <typename T>
+    void update_min_max(T min, T max);
+
+    const IImage *_input;    /**< Input image. */
+    int32_t      *_min;      /**< Minimum value. */
+    int32_t      *_max;      /**< Maximum value. */
+    int32_t       _min_init; /**< Value to initialise global minimum value. */
+    int32_t       _max_init; /**< Value to initialise global maximum value. */
+    std::mutex    _mtx;      /**< Mutex used for result reduction. */
+};
+
+/** Interface for the kernel to find min max locations of an image. */
+class NEMinMaxLocationKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEMinMaxLocationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxLocationKernel(const NEMinMaxLocationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxLocationKernel &operator=(const NEMinMaxLocationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMinMaxLocationKernel(NEMinMaxLocationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMinMaxLocationKernel &operator=(NEMinMaxLocationKernel &&) = default;
+    /** Default destructor */
+    ~NEMinMaxLocationKernel() = default;
+
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input     Input image. Data types supported: U8/S16.
+     * @param[out] min       Minimum value of image.
+     * @param[out] max       Maximum value of image.
+     * @param[out] min_loc   Array of minimum value locations.
+     * @param[out] max_loc   Array of maximum value locations.
+     * @param[out] min_count Number of minimum value encounters.
+     * @param[out] max_count Number of maximum value encounters.
+     */
+    void configure(const IImage *input, int32_t *min, int32_t *max,
+                   ICoordinates2DArray *min_loc = nullptr, ICoordinates2DArray *max_loc = nullptr,
+                   uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
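+
+    // Illustrative sketch: `image`, `min_val` and `max_val` are assumed to
+    // come from a prior NEMinMaxKernel run, and `min_loc` to be a concrete
+    // ICoordinates2DArray implementation (an assumption of this sketch).
+    //
+    //   uint32_t min_count = 0;
+    //   NEMinMaxLocationKernel loc;
+    //   loc.configure(&image, &min_val, &max_val, &min_loc, nullptr, &min_count);
+    //   loc.run(loc.window());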
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    bool is_parallelisable() const override;
+
+private:
+    /** Performs the min/max location algorithm on T type images on a given window.
+     *
+     * @param win The window to run the algorithm on.
+     */
+    template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
+    void minmax_loc(const Window &win);
+    /** Common signature for all the specialised MinMaxLoc functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using MinMaxLocFunction = void (NEMinMaxLocationKernel::*)(const Window &window);
+    /** MinMaxLoc function to use for the particular image types passed to configure() */
+    MinMaxLocFunction _func;
+    /** Helper to create a function pointer table for the parameterized MinMaxLocation functions. */
+    template <class T, typename>
+    struct create_func_table;
+
+    const IImage        *_input;                             /**< Input image. */
+    int32_t             *_min;                               /**< Minimum value. */
+    int32_t             *_max;                               /**< Maximum value. */
+    uint32_t            *_min_count;                         /**< Count of minimum value encounters. */
+    uint32_t            *_max_count;                         /**< Count of maximum value encounters. */
+    ICoordinates2DArray *_min_loc;                           /**< Locations of minimum values. */
+    ICoordinates2DArray *_max_loc;                           /**< Locations of maximum values. */
+    unsigned int         _num_elems_processed_per_iteration; /**< Elements processed per iteration. */
+};
+}
+#endif /*__ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
new file mode 100644
index 0000000..ede0294
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__
+#define __ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to apply a non-linear filter */
+class NENonLinearFilterKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NENonLinearFilterKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonLinearFilterKernel(const NENonLinearFilterKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonLinearFilterKernel &operator=(const NENonLinearFilterKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NENonLinearFilterKernel(NENonLinearFilterKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &&) = default;
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: U8
+     * @param[in]  function         Non-linear function to perform
+     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
+     * @param[in]  pattern          Mask pattern
+     * @param[in]  mask             The given mask. Only used if @p pattern is MatrixPattern::OTHER
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, bool border_undefined);
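+
+    // Illustrative sketch: a 5x5 median filter over a box pattern; `src` and
+    // `dst` are assumed to be allocated U8 tensors. The mask buffer is only
+    // read when the pattern is MatrixPattern::OTHER.
+    //
+    //   uint8_t mask[5 * 5] = {};
+    //   NENonLinearFilterKernel filter;
+    //   filter.configure(&src, &dst, NonLinearFilterFunction::MEDIAN, 5, MatrixPattern::BOX, mask, false);
+    //   filter.run(filter.window());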
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Fill mask with the corresponding given pattern.
+     *
+     * @param[in,out] mask    Mask to be filled according to pattern
+     * @param[in]     cols    Columns (width) of mask
+     * @param[in]     rows    Rows (height) of mask
+     * @param[in]     pattern Pattern to fill the mask according to
+     */
+    void fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern);
+    /** Apply a median filter when given mask pattern is defined as box.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void median_filter_box(const Window &win);
+    /** Apply a min filter when given mask pattern is defined as box.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void min_filter_box(const Window &win);
+    /** Apply a max filter when given mask pattern is defined as box.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void max_filter_box(const Window &win);
+    /** Apply a median filter when given mask pattern is defined as cross.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void median_filter_cross(const Window &win);
+    /** Apply a min filter when given mask pattern is defined as cross.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void min_filter_cross(const Window &win);
+    /** Apply a max filter when given mask pattern is defined as cross.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void max_filter_cross(const Window &win);
+    /** Apply a median filter when given mask pattern is defined as disk.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void median_filter_disk(const Window &win);
+    /** Apply a min filter when given mask pattern is defined as disk.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void min_filter_disk(const Window &win);
+    /** Apply a max filter when given mask pattern is defined as disk.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void max_filter_disk(const Window &win);
+    /** Apply a non-linear filter when given mask has user-defined pattern.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void non_linear_filter_generic(const Window &win);
+
+private:
+    unsigned int            _border_width;
+    const ITensor          *_input;
+    ITensor                *_output;
+    const uint8_t          *_mask;
+    MatrixPattern           _pattern;
+    NonLinearFilterFunction _function;
+    unsigned int            _func_idx;
+    BorderSize              _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
new file mode 100644
index 0000000..0daae59
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__
+#define __ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface to perform Non-Maxima suppression over a 3x3 window using NEON
+ *
+ * @note Used by @ref NEFastCorners and @ref NEHarrisCorners
+ */
+class NENonMaximaSuppression3x3Kernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NENonMaximaSuppression3x3Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonMaximaSuppression3x3Kernel(const NENonMaximaSuppression3x3Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonMaximaSuppression3x3Kernel &operator=(const NENonMaximaSuppression3x3Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NENonMaximaSuppression3x3Kernel(NENonMaximaSuppression3x3Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NENonMaximaSuppression3x3Kernel &operator=(NENonMaximaSuppression3x3Kernel &&) = default;
+    /** Default destructor */
+    ~NENonMaximaSuppression3x3Kernel() = default;
+
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8/F32
+     * @param[out] output           Destination tensor. Data types supported: same as @p input
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
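+
+    // Illustrative sketch: suppress non-maxima in an F32 response map `resp`
+    // into `suppressed`, both assumed to be allocated tensors of equal shape.
+    //
+    //   NENonMaximaSuppression3x3Kernel nonmax;
+    //   nonmax.configure(&resp, &suppressed, true);
+    //   nonmax.run(nonmax.window());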
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+protected:
+    /** Common signature for all the specialised non-maxima suppression 3x3 functions
+     *
+     * @param[in]  input_ptr    Pointer to the input tensor.
+     * @param[out] output_ptr   Pointer to the output tensor
+     * @param[in]  input_stride Stride of the input tensor
+     */
+    using NonMaxSuppr3x3Function = void(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride);
+
+    NonMaxSuppr3x3Function *_func;   /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
+    const ITensor          *_input;  /**< Source tensor */
+    ITensor                *_output; /**< Destination tensor */
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in F16 if the input data type is F32
+ */
+class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
+{
+public:
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8/F32.
+     * @param[out] output           Destination tensor. Data types supported: same as @p input
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+};
+#else
+using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
+#endif
+}
+#endif /*__ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
new file mode 100644
index 0000000..d4e36d5
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class NENormalizationLayerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NENormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENormalizationLayerKernel(const NENormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENormalizationLayerKernel &operator=(const NENormalizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    NENormalizationLayerKernel(NENormalizationLayerKernel &&) = default;
+    /** Default move assignment operator. */
+    NENormalizationLayerKernel &operator=(NENormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NENormalizationLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+     *                           and an optional 4th dimension for batch of inputs. Data types supported: QS8/F32.
+     * @param[in]  input_squared Source tensor with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
+     *                           Data type supported: same as @p input
+     * @param[out] output        Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in]  norm_info     Normalization layer information like the normalization type, normalization size and other parameters.
+     */
+    void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
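+
+    // Illustrative sketch: cross-map normalization over 5 maps; `in`, `in_sq`
+    // (the element-wise square of `in`) and `out` are assumed to be allocated
+    // F32 tensors of identical shape.
+    //
+    //   NENormalizationLayerKernel norm;
+    //   norm.configure(&in, &in_sq, &out, NormalizationLayerInfo(NormType::CROSS_MAP, 5));
+    //   norm.run(norm.window());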
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Function to perform normalization depending on the given template
+     *  dimension. The second template parameter specifies whether the
+     *  normalization has to be 1D or 2D.
+     *
+     * @note Only supported normalizations are:
+     *  - 1D over X or Z
+     *  - 2D over X and Y
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <unsigned int dim, bool do_2D_norm>
+    void normalize(const Window &window);
+
+    /** Function to perform normalization for fixed-point values depending on
+     * the given template dimension. The second template parameter specifies
+     * whether the normalization has to be 1D or 2D.
+     *
+     * @note Only supported normalizations are:
+     *  - 1D over X or Z
+     *  - 2D over X and Y
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <unsigned int dim, bool do_2D_norm>
+    void normalize_fixed_point(const Window &window);
+    /** Common signature for all the specialised normalization functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window);
+
+private:
+    NormalizationFunction  _func;
+    const ITensor         *_input;
+    const ITensor         *_input_squared;
+    ITensor               *_output;
+    NormalizationLayerInfo _norm_info;
+    BorderSize             _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
new file mode 100644
index 0000000..7e402cd
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__
+#define __ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform pixel-wise multiplication between two tensors */
+class NEPixelWiseMultiplicationKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEPixelWiseMultiplicationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPixelWiseMultiplicationKernel(const NEPixelWiseMultiplicationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPixelWiseMultiplicationKernel &operator=(const NEPixelWiseMultiplicationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEPixelWiseMultiplicationKernel(NEPixelWiseMultiplicationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEPixelWiseMultiplicationKernel &operator=(NEPixelWiseMultiplicationKernel &&) = default;
+    /** Default destructor */
+    ~NEPixelWiseMultiplicationKernel() = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+     *
+     * @param[in]  input1          An input tensor. Data types supported: U8/QS8/S16/F32.
+     * @param[in]  input2          An input tensor. Data types supported: U8/QS8/S16/F32.
+     * @param[out] output          The output tensor. Data types supported: U8 (only if both inputs are U8)/S16/F32.
+     * @param[in]  scale           Scale to apply after multiplication.
+     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in]  overflow_policy Overflow policy.
+     * @param[in]  rounding_policy Rounding policy.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
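+
+    // Illustrative sketch: multiply two U8 tensors into a U8 output with a
+    // 1/255 scale; per the note above, this scale pairs with round-to-nearest
+    // rounding. All tensors are assumed allocated and of matching shape.
+    //
+    //   NEPixelWiseMultiplicationKernel mul;
+    //   mul.configure(&a, &b, &out, 1.f / 255.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP);
+    //   mul.run(mul.window());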
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised multiplication functions with integer scaling factor
+     *
+     * @param[in]  input1_ptr Pointer to the first input tensor.
+     * @param[in]  input2_ptr Pointer to the second input tensor.
+     * @param[out] output_ptr Pointer to the output tensor.
+     */
+    using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale);
+    /** Common signature for all the specialised multiplication functions with fixed-point values
+     *
+     * @param[in]  input1_ptr           Pointer to the first input tensor.
+     * @param[in]  input2_ptr           Pointer to the second input tensor.
+     * @param[in]  scale                Scaling factor.
+     * @param[in]  fixed_point_position Fixed-point position that expresses the number of bits for the fractional part of the number.
+     * @param[out] output_ptr           Pointer to the output tensor.
+     */
+    using MulFunctionQInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale, int fixed_point_position);
+    /** Common signature for all the specialised multiplication functions with float scaling factor
+     *
+     * @param[in]  input1_ptr Pointer to the first input tensor.
+     * @param[in]  input2_ptr Pointer to the second input tensor.
+     * @param[out] output_ptr Pointer to the output tensor.
+     */
+    using MulFunctionFloat = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale);
+
+    MulFunctionFloat *_func_float;
+    MulFunctionInt   *_func_int;
+    MulFunctionQInt  *_func_q_int;
+
+private:
+    const ITensor *_input1;
+    const ITensor *_input2;
+    ITensor       *_output;
+    float          _scale;
+    int            _scale_exponent;
+};
+}
+#endif /*__ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
new file mode 100644
index 0000000..62a0878
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the pooling layer kernel */
+class NEPoolingLayerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEPoolingLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPoolingLayerKernel(const NEPoolingLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPoolingLayerKernel &operator=(const NEPoolingLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEPoolingLayerKernel(NEPoolingLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEPoolingLayerKernel &operator=(NEPoolingLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEPoolingLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. Data types supported: QS8/F32.
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     */
+    void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
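+
+    // Illustrative sketch: 2x2 max pooling with stride 2 and no padding;
+    // `in` and `out` are assumed to be allocated F32 tensors of compatible shapes.
+    //
+    //   NEPoolingLayerKernel pool;
+    //   pool.configure(&in, &out, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));
+    //   pool.run(pool.window());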
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Function to perform 2x2 pooling.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling2_f32(const Window &window_input, const Window &window);
+    /** Function to perform 2x2 pooling for 8-bit fixed point.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling2_q8(const Window &window_input, const Window &window);
+    /** Function to perform 3x3 pooling.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling3_f32(const Window &window_input, const Window &window);
+    /** Function to perform 3x3 pooling for 8-bit fixed point.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling3_q8(const Window &window_input, const Window &window);
+    /** Common signature for all the specialised Pooling functions
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    using PoolingFunction = void (NEPoolingLayerKernel::*)(const Window &window_input, const Window &window);
+
+private:
+    PoolingFunction  _func;
+    const ITensor   *_input;
+    ITensor         *_output;
+    PoolingLayerInfo _pool_info;
+    int              _num_elems_processed_per_iteration;
+    BorderSize       _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NERemapKernel.h b/arm_compute/core/NEON/kernels/NERemapKernel.h
new file mode 100644
index 0000000..f9eae68
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NERemapKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEREMAPKERNEL_H__
+#define __ARM_COMPUTE_NEREMAPKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a remap on a tensor */
+class NERemapKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NERemapKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NERemapKernel(const NERemapKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NERemapKernel &operator=(const NERemapKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NERemapKernel(NERemapKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NERemapKernel &operator=(NERemapKernel &&) = default;
+    /** Default destructor */
+    ~NERemapKernel() = default;
+
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input  Source tensor. Data type supported: U8.
+     * @param[in]  map_x  Map for X coordinates. Data type supported: F32.
+     * @param[in]  map_y  Map for Y coordinates. Data type supported: F32.
+     * @param[out] output Destination tensor. Data type supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
+     * @param[in]  policy The interpolation type.
+     */
+    void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy);
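+
+    // Illustrative sketch: `src` and `dst` are assumed to be allocated U8
+    // tensors, `mx` and `my` F32 tensors holding the sampling coordinates.
+    //
+    //   NERemapKernel remap;
+    //   remap.configure(&src, &mx, &my, &dst, InterpolationPolicy::NEAREST_NEIGHBOR);
+    //   remap.run(remap.window());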
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** function to perform nearest interpolation on the given window */
+    void remap_nearest(const Window &window);
+    /** function to perform bilinear interpolation on the given window */
+    void remap_bilinear(const Window &window);
+    /** Remap function to use for the particular interpolation type passed to configure() */
+    void (NERemapKernel::*_func)(const Window &window);
+
+    const ITensor *_input;  /**< Input image */
+    ITensor       *_output; /**< Output image */
+    const ITensor *_map_x;  /**< Input remap x coordinates */
+    const ITensor *_map_y;  /**< Input remap y coordinates */
+};
+}
+#endif /*__ARM_COMPUTE_NEREMAPKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
new file mode 100644
index 0000000..03e2652
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESCALEKERNEL_H__
+#define __ARM_COMPUTE_NESCALEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform scaling on a tensor */
+class NEScaleKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEScaleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEScaleKernel(const NEScaleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEScaleKernel &operator=(const NEScaleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEScaleKernel(NEScaleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEScaleKernel &operator=(NEScaleKernel &&) = default;
+    /** Default destructor */
+    ~NEScaleKernel() = default;
+
+    /** Initialise the kernel's inputs, output and interpolation policy
+     *
+     * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8/S16.
+     * @param[in]  dx               Pixel's distance between the real X coordinate and the smallest following integer X coordinate. Data type supported: F32
+     * @param[in]  dy               Pixel's distance between the real Y coordinate and the smallest following integer Y coordinate. Data type supported: F32
+     * @param[in]  offsets          Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
+     * @param[out] output           Destination tensor. Data types supported: U8/S16. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  policy           Interpolation type to use
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined);
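+
+    // Illustrative sketch: `src` and `dst` are assumed to be allocated U8
+    // tensors, and `dx`/`dy` (F32) and `offsets` (S32) pre-computed auxiliary
+    // tensors with the width and height of the output, as the note requires.
+    //
+    //   NEScaleKernel scale;
+    //   scale.configure(&src, &dx, &dy, &offsets, &dst, InterpolationPolicy::BILINEAR, false);
+    //   scale.run(scale.window());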
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** function to perform scale using nearest interpolation on the given window */
+    void scale_nearest(const Window &window);
+    /** function to perform scale using bilinear interpolation on the given window */
+    void scale_bilinear(const Window &window);
+    /** function to perform scale using area interpolation on the given window
+     *
+     *  @note Used only in case of down-sampling.
+     */
+    void scale_area(const Window &window);
+    /** Scale function to use for the particular interpolation type passed to configure() */
+    void (NEScaleKernel::*_func)(const Window &window);
+
+    const ITensor *_offsets;
+    const ITensor *_dx;
+    const ITensor *_dy;
+    const ITensor *_input;
+    ITensor       *_output;
+};
+}
+#endif /*__ARM_COMPUTE_NESCALEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h
new file mode 100644
index 0000000..c618456
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESCHARR3x3KERNEL_H__
+#define __ARM_COMPUTE_NESCHARR3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run a 3x3 Scharr filter on a tensor.
+ *
+ * @f[
+ *      \mathbf{G}_x=\begin{vmatrix}
+ *      -3  & 0 & +3\\
+ *      -10 & 0 & +10\\
+ *      -3  & 0 & +3
+ *      \end{vmatrix}
+ * @f]
+ */
+class NEScharr3x3Kernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEScharr3x3Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEScharr3x3Kernel(const NEScharr3x3Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEScharr3x3Kernel &operator=(const NEScharr3x3Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEScharr3x3Kernel(NEScharr3x3Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEScharr3x3Kernel &operator=(NEScharr3x3Kernel &&) = default;
+    /** Default destructor */
+    ~NEScharr3x3Kernel() = default;
+
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
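+
+    // Illustrative sketch: compute only the X gradient by passing nullptr for
+    // the unused output; `src` (U8) and `gx` (S16) are assumed allocated.
+    //
+    //   NEScharr3x3Kernel scharr;
+    //   scharr.configure(&src, &gx, nullptr, false);
+    //   scharr.run(scharr.window());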
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    bool           _run_scharr_x; /**< Do we need to run Scharr X ? */
+    bool           _run_scharr_y; /**< Do we need to run Scharr Y ? */
+    const ITensor *_input;        /**< Input tensor */
+    ITensor       *_output_x;     /**< Output tensor for scharr X */
+    ITensor       *_output_y;     /**< Output tensor for scharr Y */
+};
+}
+#endif /*__ARM_COMPUTE_NESCHARR3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
new file mode 100644
index 0000000..246dd83
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL3x3KERNEL_H__
+#define __ARM_COMPUTE_NESOBEL3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run a 3x3 Sobel X filter on a tensor.
+ *
+ * @f[
+ *      \mathbf{G}_x=\begin{vmatrix}
+ *      -1 & 0 & +1\\
+ *      -2 & 0 & +2\\
+ *      -1 & 0 & +1
+ *      \end{vmatrix}
+ * @f]
+ */
+class NESobel3x3Kernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NESobel3x3Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel3x3Kernel(const NESobel3x3Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel3x3Kernel &operator=(const NESobel3x3Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESobel3x3Kernel(NESobel3x3Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESobel3x3Kernel &operator=(NESobel3x3Kernel &&) = default;
+    /** Default destructor */
+    ~NESobel3x3Kernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
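+
+    // Illustrative sketch: compute both gradients; `src` (U8) and `gx`/`gy`
+    // (S16) are assumed to be allocated tensors.
+    //
+    //   NESobel3x3Kernel sobel;
+    //   sobel.configure(&src, &gx, &gy, false);
+    //   sobel.run(sobel.window());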
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    bool           _run_sobel_x; /**< Do we need to run Sobel X ? */
+    bool           _run_sobel_y; /**< Do we need to run Sobel Y ? */
+    const ITensor *_input;       /**< Input tensor */
+    ITensor       *_output_x;    /**< Output tensor for sobel X */
+    ITensor       *_output_y;    /**< Output tensor for sobel Y */
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
new file mode 100644
index 0000000..49c1c41
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL5x5KERNEL_H__
+#define __ARM_COMPUTE_NESOBEL5x5KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the horizontal pass of the 5x5 Sobel filter on a tensor. */
+class NESobel5x5HorKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NESobel5x5HorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel5x5HorKernel(const NESobel5x5HorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel5x5HorKernel &operator=(const NESobel5x5HorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESobel5x5HorKernel(NESobel5x5HorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESobel5x5HorKernel &operator=(NESobel5x5HorKernel &&) = default;
+    /** Default destructor */
+    ~NESobel5x5HorKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @note At least one of output_x or output_y must be set
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;       /**< Input tensor */
+    ITensor       *_output_x;    /**< X output of horizontal pass */
+    ITensor       *_output_y;    /**< Y output of horizontal pass */
+    bool           _run_sobel_x; /**< Do we need to run Sobel X? */
+    bool           _run_sobel_y; /**< Do we need to run Sobel Y? */
+    BorderSize     _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of the 5x5 Sobel filter on a tensor. */
+class NESobel5x5VertKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NESobel5x5VertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel5x5VertKernel(const NESobel5x5VertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel5x5VertKernel &operator=(const NESobel5x5VertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESobel5x5VertKernel(NESobel5x5VertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESobel5x5VertKernel &operator=(NESobel5x5VertKernel &&) = default;
+    /** Default destructor */
+    ~NESobel5x5VertKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input_x          Input for X (X output of the horizontal pass). Data type supported: S16.
+     * @param[in]  input_y          Input for Y (Y output of the horizontal pass). Data type supported: S16.
+     * @param[out] output_x         Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
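+
+    // Illustrative sketch of the two-pass 5x5 Sobel: the horizontal pass
+    // writes S16 intermediates (`tmp_x`, `tmp_y`, assumed allocated) that
+    // this vertical pass consumes to produce the final gradients `gx`/`gy`.
+    //
+    //   NESobel5x5HorKernel hor;
+    //   NESobel5x5VertKernel vert;
+    //   hor.configure(&src, &tmp_x, &tmp_y, false);
+    //   vert.configure(&tmp_x, &tmp_y, &gx, &gy, false);
+    //   hor.run(hor.window());
+    //   vert.run(vert.window());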
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    ITensor *_input_x;     /**< X input (X output of the horizontal pass) */
+    ITensor *_input_y;     /**< Y input (Y output of the horizontal pass) */
+    ITensor *_output_x;    /**< X output of sobel */
+    ITensor *_output_y;    /**< Y output of sobel */
+    bool     _run_sobel_x; /**< Do we need to run sobel X? */
+    bool     _run_sobel_y; /**< Do we need to run sobel Y? */
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL5x5KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
new file mode 100644
index 0000000..4bff859
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL7x7KERNEL_H__
+#define __ARM_COMPUTE_NESOBEL7x7KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the horizontal pass of the 7x7 Sobel filter on a tensor. */
+class NESobel7x7HorKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NESobel7x7HorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel7x7HorKernel(const NESobel7x7HorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel7x7HorKernel &operator=(const NESobel7x7HorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESobel7x7HorKernel(NESobel7x7HorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESobel7x7HorKernel &operator=(NESobel7x7HorKernel &&) = default;
+    /** Default destructor */
+    ~NESobel7x7HorKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S32.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S32.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;       /**< Input tensor */
+    ITensor       *_output_x;    /**< X output of horizontal pass */
+    ITensor       *_output_y;    /**< Y output of horizontal pass */
+    bool           _run_sobel_x; /**< Do we need to run Sobel X? */
+    bool           _run_sobel_y; /**< Do we need to run Sobel Y? */
+    BorderSize     _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of the 7x7 Sobel filter on a tensor. */
+class NESobel7x7VertKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NESobel7x7VertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel7x7VertKernel(const NESobel7x7VertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel7x7VertKernel &operator=(const NESobel7x7VertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESobel7x7VertKernel(NESobel7x7VertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESobel7x7VertKernel &operator=(NESobel7x7VertKernel &&) = default;
+    /** Default destructor */
+    ~NESobel7x7VertKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @note At least one of output_x or output_y must be set.
+     * @note If output_x is set then input_x must be set too.
+     * @note If output_y is set then input_y must be set too.
+     *
+     * @param[in]  input_x          (Optional) Input for X (X output of the horizontal pass). Data type supported: S32.
+     * @param[in]  input_y          (Optional) Input for Y (Y output of the horizontal pass). Data type supported: S32.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S32.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S32.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input_x;     /**< X input (X output of the horizontal pass) */
+    const ITensor *_input_y;     /**< Y input (Y output of the horizontal pass) */
+    ITensor       *_output_x;    /**< X output of Sobel */
+    ITensor       *_output_y;    /**< Y output of Sobel */
+    bool           _run_sobel_x; /**< Do we need to run Sobel X? */
+    bool           _run_sobel_y; /**< Do we need to run Sobel Y? */
+};
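+
+/* Illustrative usage sketch, not part of the API: since the outputs are
+ * optional, only the X gradient can be computed by passing nullptr for the
+ * Y tensors. Tensor names and allocation are assumed here for illustration:
+ *
+ *   NESobel7x7HorKernel  hor;
+ *   NESobel7x7VertKernel vert;
+ *   hor.configure(&src, &tmp_x, nullptr, border_undefined);
+ *   vert.configure(&tmp_x, nullptr, &grad_x, nullptr, border_undefined);
+ *   hor.run(hor.window());
+ *   vert.run(vert.window());
+ */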
+}
+#endif /*__ARM_COMPUTE_NESOBEL7x7KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
new file mode 100644
index 0000000..ab626ad
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__
+#define __ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel identifying the max value of 1D logits */
+class NELogits1DMaxKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NELogits1DMaxKernel();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QS8, F32.
+     * @param[out] output Destination tensor. Data types supported: same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    using Logits1DMaxFunction = void(const ITensor *in, ITensor *out, const Window &window);
+
+private:
+    Logits1DMaxFunction *_func;
+    BorderSize           _border_size;
+};
+
+/** Interface for shifting the logit values around the max value and exponentiating the result */
+class NELogits1DShiftExpSumKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NELogits1DShiftExpSumKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELogits1DShiftExpSumKernel(const NELogits1DShiftExpSumKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELogits1DShiftExpSumKernel &operator=(const NELogits1DShiftExpSumKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NELogits1DShiftExpSumKernel(NELogits1DShiftExpSumKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NELogits1DShiftExpSumKernel &operator=(NELogits1DShiftExpSumKernel &&) = default;
+    /** Default destructor */
+    ~NELogits1DShiftExpSumKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QS8, F32.
+     * @param[in]  max    Max values tensor. Data types supported: same as @p input.
+     * @param[out] output Destination tensor. Data types supported: same as @p input.
+     * @param[out] sum    Sum of 1D logits tensor. Data types supported: same as @p input.
+     */
+    void configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using Logits1DShiftExpSumFunction = void(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window);
+
+private:
+    Logits1DShiftExpSumFunction *_func;
+    const ITensor               *_input;
+    const ITensor               *_max;
+    ITensor                     *_output;
+    ITensor                     *_sum;
+};
+
+/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
+class NELogits1DNormKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NELogits1DNormKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELogits1DNormKernel(const NELogits1DNormKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELogits1DNormKernel &operator=(const NELogits1DNormKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NELogits1DNormKernel(NELogits1DNormKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NELogits1DNormKernel &operator=(NELogits1DNormKernel &&) = default;
+    /** Default destructor */
+    ~NELogits1DNormKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QS8, F32.
+     * @param[in]  sum    Sum tensor. The number of dimensions should be dim(input)-1. Data types supported: same as @p input.
+     * @param[out] output Destination tensor. Data types supported: same as @p input.
+     */
+    void configure(const ITensor *input, const ITensor *sum, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using Logits1DNormFunction = void(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window);
+
+private:
+    Logits1DNormFunction *_func;
+    const ITensor        *_input;
+    const ITensor        *_sum;
+    ITensor              *_output;
+};
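+
+/* Illustrative usage sketch, not part of the API: the three kernels compute
+ * softmax(x) = exp(x - max(x)) / sum(exp(x - max(x))) in stages. Tensor names
+ * and allocation are assumed here for illustration:
+ *
+ *   NELogits1DMaxKernel         max_kernel;
+ *   NELogits1DShiftExpSumKernel shift_exp_sum_kernel;
+ *   NELogits1DNormKernel        norm_kernel;
+ *   max_kernel.configure(&input, &max);
+ *   shift_exp_sum_kernel.configure(&input, &max, &tmp, &sum);
+ *   norm_kernel.configure(&tmp, &sum, &output);
+ *   max_kernel.run(max_kernel.window());
+ *   shift_exp_sum_kernel.run(shift_exp_sum_kernel.window());
+ *   norm_kernel.run(norm_kernel.window());
+ */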
+}
+#endif /*__ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NETableLookupKernel.h b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
new file mode 100644
index 0000000..b3963e5
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+class ILut;
+
+/** Interface for the kernel to perform table lookup calculations. */
+class NETableLookupKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NETableLookupKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NETableLookupKernel(const NETableLookupKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NETableLookupKernel &operator=(const NETableLookupKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NETableLookupKernel(NETableLookupKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NETableLookupKernel &operator=(NETableLookupKernel &&) = default;
+    /** Initialise the kernel's input, lut and output.
+     *
+     * @param[in]  input  An input tensor. Data types supported: U8/S16.
+     * @param[in]  lut    The input LUT.
+     * @param[out] output The output tensor. Data types supported: same as @p input
+     */
+    void configure(const ITensor *input, const ILut *lut, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Perform table lookup on a given window.
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <class T>
+    void tableLookup(const Window &window);
+    /** Common signature for all the specialised lut functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using TableLookupFunction = void (NETableLookupKernel::*)(const Window &window);
+    /** Sub function to use for the particular tensor types passed to configure() */
+    TableLookupFunction _func;
+    const ILut         *_lut;
+};
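+
+/* Illustrative usage sketch, not part of the API: `lut` is assumed to be a
+ * fully populated ILut and `src`/`dst` allocated tensors of a supported type:
+ *
+ *   NETableLookupKernel lookup;
+ *   lookup.configure(&src, &lut, &dst);
+ *   lookup.run(lookup.window());
+ */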
+}
+#endif /* __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEThresholdKernel.h b/arm_compute/core/NEON/kernels/NEThresholdKernel.h
new file mode 100644
index 0000000..7781762
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEThresholdKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETHRESHOLDKERNEL_H__
+#define __ARM_COMPUTE_NETHRESHOLDKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the thresholding kernel */
+class NEThresholdKernel : public INEKernel
+{
+public:
+    /** Constructor
+     * Initialise all the pointers to nullptr and the parameters to zero.
+     */
+    NEThresholdKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEThresholdKernel(const NEThresholdKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEThresholdKernel &operator=(const NEThresholdKernel &) = delete;
+    /** Initialise the kernel's input, output and threshold parameters.
+     *
+     * @param[in]  input       An input tensor. Data type supported: U8
+     * @param[out] output      The output tensor. Data type supported: U8.
+     * @param[in]  threshold   Threshold. When the threshold type is RANGE, this is used as the lower threshold.
+     * @param[in]  false_value Value to set when the condition is not respected.
+     * @param[in]  true_value  Value to set when the condition is respected.
+     * @param[in]  type        Thresholding type. Either RANGE or BINARY.
+     * @param[in]  upper       Upper threshold. Only used when the thresholding type is RANGE.
+     */
+    void configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Run binary thresholding on the given window */
+    void run_binary(const Window &window);
+    /** Run range thresholding on the given window */
+    void run_range(const Window &window);
+
+    void (NEThresholdKernel::*_func)(const Window &window); /**< Threshold function to use for the particular thresholding type passed to configure() */
+
+    const ITensor *_input;  /**< Input */
+    ITensor       *_output; /**< Output */
+    uint8_t        _threshold;   /**< Threshold (lower threshold when the type is RANGE) */
+    uint8_t        _false_value; /**< Value to set when the condition is not respected */
+    uint8_t        _true_value;  /**< Value to set when the condition is respected */
+    uint8_t        _upper;       /**< Upper threshold (only used when the type is RANGE) */
+};
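+
+/* Illustrative usage sketch, not part of the API: with BINARY thresholding,
+ * pixels above the threshold map to true_value and all others to false_value;
+ * with RANGE, pixels inside [threshold, upper] map to true_value. Tensor
+ * names and allocation are assumed here for illustration:
+ *
+ *   NEThresholdKernel thresh;
+ *   thresh.configure(&src, &dst, 127, 0, 255, ThresholdType::BINARY, 0);
+ *   thresh.run(thresh.window());
+ */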
+}
+#endif /*__ARM_COMPUTE_NETHRESHOLDKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h
new file mode 100644
index 0000000..ac9449f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETRANSPOSEKERNEL_H__
+#define __ARM_COMPUTE_NETRANSPOSEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel which transposes the elements of a matrix.
+ *
+ * [width, height, batch] -> [height, width, batch]
+ *
+ */
+class NETransposeKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NETransposeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NETransposeKernel(const NETransposeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NETransposeKernel &operator=(const NETransposeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NETransposeKernel(NETransposeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NETransposeKernel &operator=(NETransposeKernel &&) = default;
+    /** Default destructor */
+    ~NETransposeKernel() = default;
+
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+     * @param[out] output Output tensor. Data type supported: Same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the transpose functions
+     *
+     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+     * @param[out] output The output tensor. Data type supported: same as @p input
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using TransposeFunction = void(const ITensor *input, ITensor *output, const Window &window);
+    /** Transpose function to use for the particular tensor types passed to configure() */
+    TransposeFunction *_func;
+    const ITensor     *_input;
+    ITensor           *_output;
+};
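+
+/* Illustrative usage sketch, not part of the API: `src` is assumed to be an
+ * allocated [width, height] tensor and `dst` an allocated [height, width]
+ * tensor of the same data type:
+ *
+ *   NETransposeKernel transpose;
+ *   transpose.configure(&src, &dst);
+ *   transpose.run(transpose.window());
+ */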
+}
+#endif /* __ARM_COMPUTE_NETRANSPOSEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEWarpKernel.h b/arm_compute/core/NEON/kernels/NEWarpKernel.h
new file mode 100644
index 0000000..10fed1d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEWarpKernel.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEWARPKERNEL_H__
+#define __ARM_COMPUTE_NEWARPKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Common interface for warp affine and warp perspective */
+class INEWarpKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    INEWarpKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEWarpKernel(const INEWarpKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEWarpKernel &operator=(const INEWarpKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    INEWarpKernel(INEWarpKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    INEWarpKernel &operator=(INEWarpKernel &&) = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input                 Source tensor. Data type supported: U8.
+     * @param[out] output                Destination tensor. Data type supported: U8.
+     * @param[in]  matrix                The affine or perspective matrix to use. Must be a 2x3 matrix of floats for affine, or a 3x3 matrix of floats for perspective.
+     * @param[in]  border_mode           Strategy to use for borders
+     * @param[in]  constant_border_value Constant value used for filling the border.
+     */
+    virtual void configure(const ITensor *input, ITensor *output, const float *matrix, BorderMode border_mode, uint8_t constant_border_value);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+protected:
+    /** Function to perform warp affine or warp perspective on the given window when border mode == UNDEFINED
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    virtual void warp_undefined(const Window &window) = 0;
+    /** Function to perform warp affine or warp perspective on the given window when border mode == CONSTANT
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    virtual void warp_constant(const Window &window) = 0;
+    /** Function to perform warp affine or warp perspective on the given window when border mode == REPLICATE
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    virtual void warp_replicate(const Window &window) = 0;
+    /** Common signature for all the specialised warp functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    void (INEWarpKernel::*_func)(const Window &window);
+
+    const ITensor *_input;                 /**< Input Tensor */
+    ITensor       *_output;                /**< Output Tensor */
+    uint8_t        _constant_border_value; /**< Constant value used for filling the border. This value is used for those pixels out of the ROI when the border mode is CONSTANT */
+    const float   *_matrix;                /**< The affine or perspective matrix. Must be a 2x3 matrix of floats for warp affine, or a 3x3 matrix of floats for warp perspective. */
+};
+
+/** Template interface for the kernel to compute warp affine */
+template <InterpolationPolicy interpolation>
+class NEWarpAffineKernel : public INEWarpKernel
+{
+private:
+    // Inherited methods overridden:
+    void warp_undefined(const Window &window) override;
+    void warp_constant(const Window &window) override;
+    void warp_replicate(const Window &window) override;
+};
+
+/** Template interface for the kernel to compute warp perspective */
+template <InterpolationPolicy interpolation>
+class NEWarpPerspectiveKernel : public INEWarpKernel
+{
+private:
+    // Inherited methods overridden:
+    void warp_undefined(const Window &window) override;
+    void warp_constant(const Window &window) override;
+    void warp_replicate(const Window &window) override;
+};
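+
+/* Illustrative usage sketch, not part of the API: a 2x3 affine matrix is
+ * passed as a flat array of 6 floats (column-major layout, as in OpenVX, is
+ * assumed here). For example, a pure translation by (tx, ty):
+ *
+ *   const float matrix[] = { 1.f, 0.f, 0.f, 1.f, tx, ty };
+ *   NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR> warp;
+ *   warp.configure(&src, &dst, matrix, BorderMode::CONSTANT, 0);
+ *   warp.run(warp.window());
+ */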
+}
+#endif /*__ARM_COMPUTE_NEWARPKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
new file mode 100644
index 0000000..cad2d00
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__
+#define __ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform reshaping on the weights used by convolution and locally connected layer
+ *
+ * Rearranges each 3-dimensional kernel into a single row, leading to a matrix with linearized kernels.
+ * In combination with the @ref NEIm2ColKernel, this kernel can transform a convolution into a matrix multiplication.
+ *
+ * For example, assuming a 3D weight kernel of 3x3 dimensions and a depth of 2, we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NEWeightsReshapeKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEWeightsReshapeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEWeightsReshapeKernel(const NEWeightsReshapeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEWeightsReshapeKernel &operator=(const NEWeightsReshapeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEWeightsReshapeKernel(NEWeightsReshapeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEWeightsReshapeKernel &operator=(NEWeightsReshapeKernel &&) = default;
+    /** Default destructor */
+    ~NEWeightsReshapeKernel() = default;
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input  The input tensor to convert. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                    and a 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/F32
+     * @param[in]  bias   The shared biases tensor to append. Bias is a 1D tensor with dimensions [OFM] if shared and a 2D tensor with
+     *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+     * @param[out] output The output tensor. Data types supported: Same as @p input
+     */
+    void configure(const ITensor *input, const ITensor *bias, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using WeightsReshapeKernel = void(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window);
+
+    WeightsReshapeKernel *_func;
+    const ITensor        *_input;
+    const ITensor        *_bias;
+    ITensor              *_output;
+};
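+
+/* Illustrative usage sketch, not part of the API: `weights` is assumed to be
+ * an allocated [kernel_x, kernel_y, IFM, OFM] tensor, `bias` a matching
+ * [OFM] tensor, and `reshaped` the linearized destination:
+ *
+ *   NEWeightsReshapeKernel reshape;
+ *   reshape.configure(&weights, &bias, &reshaped);
+ *   reshape.run(reshape.window());
+ */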
+}
+
+#endif /*__ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__ */