COMPMID-3638: Move NEON kernels

Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: Ieed3e4bc8be7fef80c90c5094599b477a56fc473
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4285
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
index d7af9c9..a134e3e 100644
--- a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
+++ b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
@@ -84,14 +84,14 @@
 
     update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
 
-    INEKernel::configure(win);
+    ICPPKernel::configure(win);
 }
 
 void CPPCornerCandidatesKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
     Iterator input(_input, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
diff --git a/src/core/NEON/INEKernel.h b/src/core/NEON/INEKernel.h
new file mode 100644
index 0000000..7ad2016
--- /dev/null
+++ b/src/core/NEON/INEKernel.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_INEKERNEL_H
+#define ARM_COMPUTE_INEKERNEL_H
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+
+namespace arm_compute
+{
+/** Common interface for all kernels implemented in NEON (alias of @ref ICPPKernel). */
+using INEKernel = ICPPKernel;
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_INEKERNEL_H */
diff --git a/src/core/NEON/INESimpleKernel.h b/src/core/NEON/INESimpleKernel.h
new file mode 100644
index 0000000..da32d66
--- /dev/null
+++ b/src/core/NEON/INESimpleKernel.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_INESIMPLEKERNEL_H
+#define ARM_COMPUTE_INESIMPLEKERNEL_H
+
+#include "arm_compute/core/CPP/ICPPSimpleKernel.h"
+
+namespace arm_compute
+{
+/** Interface for simple NEON kernels having 1 tensor input and 1 tensor output (alias of @ref ICPPSimpleKernel). */
+using INESimpleKernel = ICPPSimpleKernel;
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_INESIMPLEKERNEL_H */
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
new file mode 100644
index 0000000..c1924d6
--- /dev/null
+++ b/src/core/NEON/NEKernels.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEKERNELS_H
+#define ARM_COMPUTE_NEKERNELS_H
+
+/* Header regrouping all the NEON kernels */
+#include "src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+#include "src/core/NEON/kernels/NEAccumulateKernel.h"
+#include "src/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "src/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+#include "src/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+#include "src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
+#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseAndKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseNotKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseOrKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseXorKernel.h"
+#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
+#include "src/core/NEON/kernels/NEBox3x3Kernel.h"
+#include "src/core/NEON/kernels/NECannyEdgeKernel.h"
+#include "src/core/NEON/kernels/NEChannelCombineKernel.h"
+#include "src/core/NEON/kernels/NEChannelExtractKernel.h"
+#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
+#include "src/core/NEON/kernels/NECol2ImKernel.h"
+#include "src/core/NEON/kernels/NEColorConvertKernel.h"
+#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEConvolutionKernel.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NECropKernel.h"
+#include "src/core/NEON/kernels/NECumulativeDistributionKernel.h"
+#include "src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
+#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
+#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
+#include "src/core/NEON/kernels/NEDequantizationLayerKernel.h"
+#include "src/core/NEON/kernels/NEDerivativeKernel.h"
+#include "src/core/NEON/kernels/NEDilateKernel.h"
+#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEElementwiseOperationKernel.h"
+#include "src/core/NEON/kernels/NEElementwiseUnaryKernel.h"
+#include "src/core/NEON/kernels/NEErodeKernel.h"
+#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
+#include "src/core/NEON/kernels/NEFastCornersKernel.h"
+#include "src/core/NEON/kernels/NEFillArrayKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEFlattenLayerKernel.h"
+#include "src/core/NEON/kernels/NEFloorKernel.h"
+#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
+#include "src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/NEON/kernels/NEGatherKernel.h"
+#include "src/core/NEON/kernels/NEGaussian3x3Kernel.h"
+#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "src/core/NEON/kernels/NEGaussianPyramidKernel.h"
+#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
+#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h"
+#include "src/core/NEON/kernels/NEHOGDetectorKernel.h"
+#include "src/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEHistogramKernel.h"
+#include "src/core/NEON/kernels/NEIm2ColKernel.h"
+#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
+#include "src/core/NEON/kernels/NEIntegralImageKernel.h"
+#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
+#include "src/core/NEON/kernels/NELKTrackerKernel.h"
+#include "src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
+#include "src/core/NEON/kernels/NEMeanStdDevKernel.h"
+#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
+#include "src/core/NEON/kernels/NEMedian3x3Kernel.h"
+#include "src/core/NEON/kernels/NEMemsetKernel.h"
+#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h"
+#include "src/core/NEON/kernels/NEMinMaxLocationKernel.h"
+#include "src/core/NEON/kernels/NENonLinearFilterKernel.h"
+#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
+#include "src/core/NEON/kernels/NEPermuteKernel.h"
+#include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
+#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
+#include "src/core/NEON/kernels/NEQuantizationLayerKernel.h"
+#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h"
+#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+#include "src/core/NEON/kernels/NERangeKernel.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/NEON/kernels/NERemapKernel.h"
+#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
+#include "src/core/NEON/kernels/NEReshapeLayerKernel.h"
+#include "src/core/NEON/kernels/NEReverseKernel.h"
+#include "src/core/NEON/kernels/NEScaleKernel.h"
+#include "src/core/NEON/kernels/NEScharr3x3Kernel.h"
+#include "src/core/NEON/kernels/NESelectKernel.h"
+#include "src/core/NEON/kernels/NESobel3x3Kernel.h"
+#include "src/core/NEON/kernels/NESobel5x5Kernel.h"
+#include "src/core/NEON/kernels/NESobel7x7Kernel.h"
+#include "src/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
+#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
+#include "src/core/NEON/kernels/NEStackLayerKernel.h"
+#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
+#include "src/core/NEON/kernels/NETableLookupKernel.h"
+#include "src/core/NEON/kernels/NEThresholdKernel.h"
+#include "src/core/NEON/kernels/NETileKernel.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
+#include "src/core/NEON/kernels/NEUpsampleLayerKernel.h"
+#include "src/core/NEON/kernels/NEWarpKernel.h"
+#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEYOLOLayerKernel.h"
+
+#endif /* ARM_COMPUTE_NEKERNELS_H */
diff --git a/src/core/NEON/NETracePoint.cpp b/src/core/NEON/NETracePoint.cpp
index 4a6bffa..bf48b41 100644
--- a/src/core/NEON/NETracePoint.cpp
+++ b/src/core/NEON/NETracePoint.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/core/TracePoint.h"
 
-#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+#include "src/core/NEON/kernels/NELKTrackerKernel.h"
 #include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
 #include "src/core/NEON/kernels/convolution/common/convolution.hpp"
 #include "utils/TypePrinter.h"
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
index acea0af..a6a41b8 100644
--- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+#include "src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
new file mode 100644
index 0000000..cc95172
--- /dev/null
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H
+#define ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the absolute difference kernel
+ *
+ * Absolute difference is computed by:
+ * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
+ */
+class NEAbsoluteDifferenceKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEAbsoluteDifferenceKernel";
+    }
+    /** Default constructor */
+    NEAbsoluteDifferenceKernel();
+    /** Prevent instances of this class from being copy-constructed (As this class contains pointers) */
+    NEAbsoluteDifferenceKernel(const NEAbsoluteDifferenceKernel &) = delete;
+    /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+    NEAbsoluteDifferenceKernel &operator=(const NEAbsoluteDifferenceKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEAbsoluteDifferenceKernel(NEAbsoluteDifferenceKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEAbsoluteDifferenceKernel &operator=(NEAbsoluteDifferenceKernel &&) = default;
+    /** Default destructor */
+    ~NEAbsoluteDifferenceKernel() = default;
+
+    /** Set the inputs and output tensors
+     *
+     * @param[in]  input1 Source tensor. Data types supported: U8/S16
+     * @param[in]  input2 Source tensor. Data types supported: U8/S16
+     * @param[out] output Destination tensor. Data types supported: U8 (Only if both inputs are U8), S16
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the specialised absolute difference functions
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16.
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16.
+     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16.
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using AbsDiffFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+
+    /** Absolute difference function to use for the particular tensor formats passed to configure() */
+    AbsDiffFunction *_func;
+    const ITensor   *_input1;
+    const ITensor   *_input2;
+    ITensor         *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
index 73ef7eb..46179ca 100644
--- a/src/core/NEON/kernels/NEAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
+#include "src/core/NEON/kernels/NEAccumulateKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -33,13 +33,8 @@
 
 #include <arm_neon.h>
 
-using namespace arm_compute;
-
 namespace arm_compute
 {
-class Coordinates;
-} // namespace arm_compute
-
 /* Max S16 value used for saturation purposes. */
 const static uint16x8_t max_int_u16 = vdupq_n_u16(static_cast<uint16_t>(INT16_MAX));
 
@@ -361,3 +356,4 @@
     },
     input, accum);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.h b/src/core/NEON/kernels/NEAccumulateKernel.h
new file mode 100644
index 0000000..af1298f
--- /dev/null
+++ b/src/core/NEON/kernels/NEAccumulateKernel.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEACCUMULATEKERNEL_H
+#define ARM_COMPUTE_NEACCUMULATEKERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the accumulate kernel
+ *
+ * Accumulation is computed by:
+ * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
+ */
+class NEAccumulateKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEAccumulateKernel";
+    }
+    /** Default constructor */
+    NEAccumulateKernel() = default;
+    /** Prevent instances of this class from being copy-constructed (As this class contains pointers) */
+    NEAccumulateKernel(const NEAccumulateKernel &) = delete;
+    /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+    NEAccumulateKernel &operator=(const NEAccumulateKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEAccumulateKernel(NEAccumulateKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEAccumulateKernel &operator=(NEAccumulateKernel &&) = default;
+    /** Default destructor */
+    ~NEAccumulateKernel() = default;
+    /** Set the input and accumulation tensors
+     *
+     * @param[in]     input Source tensor. Data type supported: U8.
+     * @param[in,out] accum Accumulated tensor (read and updated, see the class formula). Data type supported: S16.
+     */
+    void configure(const ITensor *input, ITensor *accum);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+};
+
+/** Interface for the accumulate weighted kernel
+ *
+ * Weighted accumulation is computed:
+ * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
+ *
+ * Where @f$ 0 \le \alpha \le 1 @f$
+ * Conceptually, the rounding for this is defined as:
+ * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
+ */
+class NEAccumulateWeightedKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEAccumulateWeightedKernel";
+    }
+    /** Default constructor */
+    NEAccumulateWeightedKernel();
+    /** Prevent instances of this class from being copy-constructed (As this class contains pointers) */
+    NEAccumulateWeightedKernel(const NEAccumulateWeightedKernel &) = delete;
+    /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+    NEAccumulateWeightedKernel &operator=(const NEAccumulateWeightedKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEAccumulateWeightedKernel(NEAccumulateWeightedKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEAccumulateWeightedKernel &operator=(NEAccumulateWeightedKernel &&) = default;
+    /** Default destructor */
+    ~NEAccumulateWeightedKernel() = default;
+    /** Set the input and accumulation tensors, and the alpha value
+     *
+     * @param[in]     input Source tensor. Data type supported: U8.
+     * @param[in]     alpha Scalar value in the range [0.0f, 1.0f]
+     * @param[in,out] accum Accumulated tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input, float alpha, ITensor *accum);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+protected:
+    float _alpha;
+};
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** Interface for the accumulate weighted kernel using F16 */
+class NEAccumulateWeightedFP16Kernel : public NEAccumulateWeightedKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEAccumulateWeightedFP16Kernel";
+    }
+    /** Default constructor */
+    NEAccumulateWeightedFP16Kernel() = default;
+    /** Prevent instances of this class from being copy-constructed (As this class contains pointers) */
+    NEAccumulateWeightedFP16Kernel(const NEAccumulateWeightedFP16Kernel &) = delete;
+    /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+    NEAccumulateWeightedFP16Kernel &operator=(const NEAccumulateWeightedFP16Kernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEAccumulateWeightedFP16Kernel(NEAccumulateWeightedFP16Kernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEAccumulateWeightedFP16Kernel &operator=(NEAccumulateWeightedFP16Kernel &&) = default;
+    /** Default destructor */
+    ~NEAccumulateWeightedFP16Kernel() = default;
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+};
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** Without FP16 vector arithmetic support the F16 kernel name is an alias of @ref NEAccumulateWeightedKernel */
+using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+/** Interface for the accumulate squared kernel
+ *
+ * The accumulation of squares is computed:
+ * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
+ *
+ * Where @f$ 0 \le shift \le 15 @f$
+ */
+class NEAccumulateSquaredKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEAccumulateSquaredKernel";
+    }
+    /** Default constructor */
+    NEAccumulateSquaredKernel();
+    /** Prevent instances of this class from being copy-constructed (As this class contains pointers) */
+    NEAccumulateSquaredKernel(const NEAccumulateSquaredKernel &) = delete;
+    /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+    NEAccumulateSquaredKernel &operator=(const NEAccumulateSquaredKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEAccumulateSquaredKernel(NEAccumulateSquaredKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEAccumulateSquaredKernel &operator=(NEAccumulateSquaredKernel &&) = default;
+    /** Default destructor */
+    ~NEAccumulateSquaredKernel() = default;
+    /** Set the input and accumulation tensors and the shift value.
+     *
+     * @param[in]     input Source tensor. Data type supported: U8.
+     * @param[in]     shift Shift value in the range of [0, 15]
+     * @param[in,out] accum Accumulated tensor. Data type supported: S16.
+     */
+    void configure(const ITensor *input, uint32_t shift, ITensor *accum);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    uint32_t _shift;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEACCUMULATEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index f61f048..51257cb 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "src/core/NEON/kernels/NEActivationLayerKernel.h"
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.h b/src/core/NEON/kernels/NEActivationLayerKernel.h
new file mode 100644
index 0000000..783783c
--- /dev/null
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H
+#define ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H
+
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/NEON/INEKernel.h"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include <arm_fp16.h>
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the activation layer kernel. */
+class NEActivationLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEActivationLayerKernel";
+    }
+    /** Default constructor */
+    NEActivationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEActivationLayerKernel(const NEActivationLayerKernel &) = delete;
+    /** Default move constructor */
+    NEActivationLayerKernel(NEActivationLayerKernel &&) = default;
+    /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+    NEActivationLayerKernel &operator=(const NEActivationLayerKernel &) = delete;
+    /** Default move assignment operator */
+    NEActivationLayerKernel &operator=(NEActivationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEActivationLayerKernel() = default;
+    /** Set the input and output tensor.
+     *
+     * @note If the output tensor is a nullptr, the activation function will be performed in-place
+     *
+     * @param[in,out] input           Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result
+     *                                of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[out]    output          Destination tensor info. Data type supported: same as @p input
+     * @param[in]     activation_info Activation layer information.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output, ActivationLayerInfo activation_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel
+     *
+     * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
+     *                     of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[in] output   Destination tensor info. Data type supported: same as @p input
+     * @param[in] act_info Activation layer information.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+private:
+    ActivationLayerInfo _act_info;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 7f1a35f..fa26b90 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+#include "src/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.h b/src/core/NEON/kernels/NEArithmeticAdditionKernel.h
new file mode 100644
index 0000000..2072ad9
--- /dev/null
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H
+#define ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform addition between two tensors */
+class NEArithmeticAdditionKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEArithmeticAdditionKernel";
+    }
+    /** Default constructor */
+    NEArithmeticAdditionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticAdditionKernel(const NEArithmeticAdditionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticAdditionKernel &operator=(const NEArithmeticAdditionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEArithmeticAdditionKernel(NEArithmeticAdditionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEArithmeticAdditionKernel &operator=(NEArithmeticAdditionKernel &&) = default;
+    /** Default destructor */
+    ~NEArithmeticAdditionKernel() = default;
+
+    /** Initialise the kernel's inputs, output and overflow policy.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in]  input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in]  input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[out] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in]  policy Overflow policy.
+     */
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel
+     *
+     * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] policy Overflow policy.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the specialised add functions
+     *
+     * @param[in]  input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in]  input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in]  policy Overflow policy.
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window);
+    /** Add function to use for the particular tensor types passed to configure() */
+    AddFunction *_func;
+    ConvertPolicy _policy; /**< Overflow policy */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index 49e503f..bdd356a 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+#include "src/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.h
new file mode 100644
index 0000000..69952d6
--- /dev/null
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H
+#define ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform subtraction between two tensors */
+class NEArithmeticSubtractionKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEArithmeticSubtractionKernel";
+    }
+    /** Default constructor */
+    NEArithmeticSubtractionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticSubtractionKernel(const NEArithmeticSubtractionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticSubtractionKernel &operator=(const NEArithmeticSubtractionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEArithmeticSubtractionKernel(NEArithmeticSubtractionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEArithmeticSubtractionKernel &operator=(NEArithmeticSubtractionKernel &&) = default;
+    /** Default destructor */
+    ~NEArithmeticSubtractionKernel() = default;
+
+    /** Initialise the kernel's inputs, output and overflow policy.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                          -> U8
+     *   - (U8,U8)                          -> S16
+     *   - (QASYMM8, QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (S16,U8)                         -> S16
+     *   - (U8,S16)                         -> S16
+     *   - (S16,S16)                        -> S16
+     *   - (S32,S32)                        -> S32
+     *   - (F16,F16)                        -> F16
+     *   - (F32,F32)                        -> F32
+     *
+     * @param[in]  input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+     * @param[in]  input2 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+     * @param[out] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32.
+     * @param[in]  policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized.
+     */
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                          -> U8
+     *   - (U8,U8)                          -> S16
+     *   - (QASYMM8, QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (S16,U8)                         -> S16
+     *   - (U8,S16)                         -> S16
+     *   - (S16,S16)                        -> S16
+     *   - (S32,S32)                        -> S32
+     *   - (F16,F16)                        -> F16
+     *   - (F32,F32)                        -> F32
+     *
+     * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+     * @param[in] input2 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+     * @param[in] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32.
+     * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the specialised sub functions
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+     * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32.
+     * @param[in]  window Region on which to execute the kernel.
+     * @param[in]  is_sat Flag to indicate if the policy is SATURATE.
+     */
+    using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window, bool is_sat);
+    /** Sub function to use for the particular tensor types passed to configure() */
+    SubFunction *_func;
+    ConvertPolicy _policy; /**< Overflow policy */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
index 65ac996..ddf6971 100644
--- a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
new file mode 100644
index 0000000..b74a948
--- /dev/null
+++ b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H
+#define ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the batch concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class NEBatchConcatenateLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEBatchConcatenateLayerKernel";
+    }
+    /** Default constructor */
+    NEBatchConcatenateLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchConcatenateLayerKernel(const NEBatchConcatenateLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchConcatenateLayerKernel &operator=(const NEBatchConcatenateLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEBatchConcatenateLayerKernel(NEBatchConcatenateLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEBatchConcatenateLayerKernel &operator=(NEBatchConcatenateLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEBatchConcatenateLayerKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     input        Input tensor info. Data types supported: All.
+     * @param[in]     batch_offset The offset on axis # 3.
+     * @param[in,out] output       Output tensor info. Data types supported: Same as @p input.
+     *
+     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
+     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
+     *
+     */
+    void configure(const ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEBatchConcatenateLayerKernel
+     *
+     * @param[in] input        Input tensor info. Data types supported: All.
+     * @param[in] batch_offset The offset on axis # 3.
+     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+private:
+    using BatchConcatFunction = void(const ITensor *in, ITensor *out, unsigned int batch_offset, const Window &window); /**< Function signature used by _func */
+
+private:
+    BatchConcatFunction *_func; /**< Pointer to a function with the BatchConcatFunction signature */
+    unsigned int         _batch_offset; /**< Batch offset - presumably the batch_offset given to configure(), TODO confirm in the .cpp */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index bda3966..afb08e5 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
+#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
new file mode 100644
index 0000000..9312073
--- /dev/null
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the batch normalization layer kernel.
+ */
+class NEBatchNormalizationLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEBatchNormalizationLayerKernel";
+    }
+    /** Default constructor */
+    NEBatchNormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchNormalizationLayerKernel(const NEBatchNormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchNormalizationLayerKernel &operator=(const NEBatchNormalizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    NEBatchNormalizationLayerKernel(NEBatchNormalizationLayerKernel &&) = default;
+    /** Default move assignment operator */
+    NEBatchNormalizationLayerKernel &operator=(NEBatchNormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEBatchNormalizationLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
+     *
+     * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
+     *                          3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                          The rest are optional and used for representing batches. Data types supported: F16/F32.
+     * @param[out]     output   Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in]      mean     Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]      var      Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]      beta     (Optional) Beta values tensor. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
+     * @param[in]      gamma    (Optional) Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
+     * @param[in]      epsilon  (Optional) Small value to avoid division by zero. Default value is 0.001f.
+     * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     */
+    void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
+                   ActivationLayerInfo act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel
+     *
+     * @param[in] input    Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result.
+     *                     3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                     The rest are optional and used for representing batches. Data types supported: F16/F32.
+     * @param[in] output   Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in] mean     Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in] var      Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in] beta     (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
+     * @param[in] gamma    (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
+     * @param[in] epsilon  (Optional) Small value to avoid division by zero. Default value is 0.001f.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                           const ITensorInfo *mean, const ITensorInfo *var,
+                           const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
+                           float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Configure execution function in case of non-fused activation **/
+    void configure_non_fused();
+    /** Configure execution function in case of fused activation **/
+    void configure_fused();
+
+    /** Template function to run batch normalization on tensors with NCHW format
+     *
+     * @tparam T                Specialization data type
+     * @tparam fused_activation Boolean that flags if it's a fused activation or not
+     * @tparam F                Activation function functor to run
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T, bool fused_activation, typename F>
+    void batch_normalization_nchw(const Window &window);
+    /** Template function to run batch normalization on tensors with NHWC format
+     *
+     * @tparam T                Specialization data type
+     * @tparam fused_activation Boolean that flags if it's a fused activation or not
+     * @tparam F                Activation function functor to run
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T, bool fused_activation, typename F>
+    void batch_normalization_nhwc(const Window &window);
+    /** Common signature for all the batch normalization functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using BatchNormFunctionPtr = void (NEBatchNormalizationLayerKernel::*)(const Window &window);
+
+private:
+    BatchNormFunctionPtr _func;
+    ITensor             *_input;
+    ITensor             *_output;
+    const ITensor       *_mean;
+    const ITensor       *_var;
+    const ITensor       *_gamma;
+    const ITensor       *_beta;
+    float                _epsilon;
+    ActivationLayerInfo  _act_info;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
index e24d7b6..10207b9 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
+#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
new file mode 100644
index 0000000..26e8224
--- /dev/null
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
+#define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the batch to space kernel */
+class NEBatchToSpaceLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEBatchToSpaceLayerKernel";
+    }
+    /** Default constructor */
+    NEBatchToSpaceLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchToSpaceLayerKernel(const NEBatchToSpaceLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchToSpaceLayerKernel &operator=(const NEBatchToSpaceLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEBatchToSpaceLayerKernel(NEBatchToSpaceLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEBatchToSpaceLayerKernel &operator=(NEBatchToSpaceLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEBatchToSpaceLayerKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
+     * @param[out] output      Tensor output. Data types supported: same as @p input
+     */
+    void configure(const ITensor *input, const ITensor *block_shape, ITensor *output);
+    /** Initialise the kernel's inputs and output (Static block shape).
+     *
+     * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape_x Block shape x value.
+     * @param[in]  block_shape_y Block shape y value.
+     * @param[out] output        Tensor output. Data types supported: same as @p input
+     */
+    void configure(const ITensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel
+     *
+     * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
+     * @param[in] block_shape 1-D tensor info with shape [M]. Data types supported: S32
+     * @param[in] output      Tensor output info. Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel (Static block shape).
+     *
+     * @param[in] input         Tensor input info. Supported tensor rank: 4. Data types supported: All.
+     * @param[in] block_shape_x Block shape x value.
+     * @param[in] block_shape_y Block shape y value.
+     * @param[in] output        Tensor output info. Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;       /**< Source tensor */
+    const ITensor *_block_shape; /**< Block shape tensor */
+    ITensor       *_output;      /**< Destination tensor */
+    DataLayout     _data_layout; /**< Data layout to be used at run-time */
+
+    int32_t _block_shape_x; /**< Block shape x value */
+    int32_t _block_shape_y; /**< Block shape y value */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index 2d49ff8..4f4de70 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseAndKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.h b/src/core/NEON/kernels/NEBitwiseAndKernel.h
new file mode 100644
index 0000000..e4603f6
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEBITWISEANDKERNEL_H
+#define ARM_COMPUTE_NEBITWISEANDKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel that computes the element-wise bitwise AND of the XY-planes of two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f]
+ */
+class NEBitwiseAndKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEBitwiseAndKernel";
+    }
+    /** Default constructor */
+    NEBitwiseAndKernel();
+    /** Copy construction is disabled (as this class contains pointers) */
+    NEBitwiseAndKernel(const NEBitwiseAndKernel &) = delete;
+    /** Copy assignment is disabled (as this class contains pointers) */
+    NEBitwiseAndKernel &operator=(const NEBitwiseAndKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEBitwiseAndKernel(NEBitwiseAndKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEBitwiseAndKernel &operator=(NEBitwiseAndKernel &&) = default;
+    /** Default destructor */
+    ~NEBitwiseAndKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input1 First input tensor. Data type supported: U8.
+     * @param[in]  input2 Second input tensor. Data type supported: U8.
+     * @param[out] output Output tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input1; /**< Source tensor 1 */
+    const ITensor *_input2; /**< Source tensor 2 */
+    ITensor       *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEBITWISEANDKERNEL_H */
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index eed9b27..c69c4ea 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseNotKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.h b/src/core/NEON/kernels/NEBitwiseNotKernel.h
new file mode 100644
index 0000000..ba47c38
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEBITWISENOTKERNEL_H
+#define ARM_COMPUTE_NEBITWISENOTKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel that computes the element-wise bitwise NOT of a tensor
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = \lnot input(x,y) @f]
+ */
+class NEBitwiseNotKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEBitwiseNotKernel";
+    }
+    /** Default constructor */
+    NEBitwiseNotKernel();
+    /** Copy construction is disabled (as this class contains pointers) */
+    NEBitwiseNotKernel(const NEBitwiseNotKernel &) = delete;
+    /** Copy assignment is disabled (as this class contains pointers) */
+    NEBitwiseNotKernel &operator=(const NEBitwiseNotKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEBitwiseNotKernel(NEBitwiseNotKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEBitwiseNotKernel &operator=(NEBitwiseNotKernel &&) = default;
+    /** Default destructor */
+    ~NEBitwiseNotKernel() = default;
+    /** Initialise the kernel's input and output
+     *
+     * @param[in]  input  Input tensor. Data type supported: U8.
+     * @param[out] output Output tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;  /**< Source tensor */
+    ITensor       *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEBITWISENOTKERNEL_H */
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index f96117e..875e639 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseOrKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.h b/src/core/NEON/kernels/NEBitwiseOrKernel.h
new file mode 100644
index 0000000..40ef757
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEBITWISEORKERNEL_H
+#define ARM_COMPUTE_NEBITWISEORKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel that computes the element-wise bitwise inclusive OR of two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f]
+ */
+class NEBitwiseOrKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEBitwiseOrKernel";
+    }
+    /** Default constructor */
+    NEBitwiseOrKernel();
+    /** Copy construction is disabled (as this class contains pointers) */
+    NEBitwiseOrKernel(const NEBitwiseOrKernel &) = delete;
+    /** Copy assignment is disabled (as this class contains pointers) */
+    NEBitwiseOrKernel &operator=(const NEBitwiseOrKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEBitwiseOrKernel(NEBitwiseOrKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEBitwiseOrKernel &operator=(NEBitwiseOrKernel &&) = default;
+    /** Default destructor */
+    ~NEBitwiseOrKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input1 First input tensor. Data type supported: U8.
+     * @param[in]  input2 Second input tensor. Data type supported: U8.
+     * @param[out] output Output tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input1; /**< Source tensor 1 */
+    const ITensor *_input2; /**< Source tensor 2 */
+    ITensor       *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEBITWISEORKERNEL_H */
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index 45d2b0a..603b49d 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseXorKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.h b/src/core/NEON/kernels/NEBitwiseXorKernel.h
new file mode 100644
index 0000000..24d07a6
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEBITWISEXORKERNEL_H
+#define ARM_COMPUTE_NEBITWISEXORKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel that computes the element-wise bitwise exclusive OR (XOR) of two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f]
+ */
+class NEBitwiseXorKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEBitwiseXorKernel";
+    }
+    /** Default constructor */
+    NEBitwiseXorKernel();
+    /** Copy construction is disabled (as this class contains pointers) */
+    NEBitwiseXorKernel(const NEBitwiseXorKernel &) = delete;
+    /** Copy assignment is disabled (as this class contains pointers) */
+    NEBitwiseXorKernel &operator=(const NEBitwiseXorKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEBitwiseXorKernel(NEBitwiseXorKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEBitwiseXorKernel &operator=(NEBitwiseXorKernel &&) = default;
+    /** Default destructor */
+    ~NEBitwiseXorKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input1 First input tensor. Data type supported: U8.
+     * @param[in]  input2 Second input tensor. Data type supported: U8.
+     * @param[out] output Output tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input1; /**< Source tensor 1 */
+    const ITensor *_input2; /**< Source tensor 2 */
+    ITensor       *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEBITWISEXORKERNEL_H */
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index 5a18e88..03d6e1c 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
+#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
new file mode 100644
index 0000000..c080ce6
--- /dev/null
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEBOUNDINGBOXTRANSFORMKERNEL_H
+#define ARM_COMPUTE_NEBOUNDINGBOXTRANSFORMKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the bounding box transform kernel */
+class NEBoundingBoxTransformKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEBoundingBoxTransformKernel";
+    }
+
+    /** Default constructor */
+    NEBoundingBoxTransformKernel();
+    /** Copy construction is disabled (as this class contains pointers) */
+    NEBoundingBoxTransformKernel(const NEBoundingBoxTransformKernel &) = delete;
+    /** Copy assignment is disabled (as this class contains pointers) */
+    NEBoundingBoxTransformKernel &operator=(const NEBoundingBoxTransformKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEBoundingBoxTransformKernel(NEBoundingBoxTransformKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEBoundingBoxTransformKernel &operator=(NEBoundingBoxTransformKernel &&) = default;
+    /** Default destructor */
+    ~NEBoundingBoxTransformKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  boxes      Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
+     * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p boxes
+     * @param[in]  deltas     Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
+     *                        Data types supported: QASYMM8 if @p boxes is QASYMM16, otherwise same as @p boxes.
+     * @param[in]  info       Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
+     *
+     * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
+     *
+     */
+    void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEBoundingBoxTransformKernel
+     *
+     * @param[in] boxes      Source tensor info. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
+     * @param[in] pred_boxes Destination tensor info. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p boxes
+     * @param[in] deltas     Bounding box translations and scales tensor info. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
+     *                       Data types supported: QASYMM8 if @p boxes is QASYMM16, otherwise same as @p boxes.
+     * @param[in] info       Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
+     *
+     * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
+     *
+     * @return a Status
+     */
+    static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    template <typename T>
+    void internal_run(const Window &window);
+
+    const ITensor           *_boxes;
+    ITensor                 *_pred_boxes;
+    const ITensor           *_deltas;
+    BoundingBoxTransformInfo _bbinfo;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEBOUNDINGBOXTRANSFORMKERNEL_H */
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
index 1177f6f..2aa8aa8 100644
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -21,14 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
+#include "src/core/NEON/kernels/NEBox3x3Kernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.h b/src/core/NEON/kernels/NEBox3x3Kernel.h
new file mode 100644
index 0000000..f6a64a7
--- /dev/null
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEBOX3x3KERNEL_H
+#define ARM_COMPUTE_NEBOX3x3KERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Box 3x3 filter */
+class NEBox3x3Kernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEBox3x3Kernel";
+    }
+    /** Default constructor */
+    NEBox3x3Kernel() = default;
+    /** Copy construction is disabled (as this class contains pointers) */
+    NEBox3x3Kernel(const NEBox3x3Kernel &) = delete;
+    /** Copy assignment is disabled (as this class contains pointers) */
+    NEBox3x3Kernel &operator=(const NEBox3x3Kernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEBox3x3Kernel(NEBox3x3Kernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEBox3x3Kernel &operator=(NEBox3x3Kernel &&) = default;
+    /** Default destructor */
+    ~NEBox3x3Kernel() = default;
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data type supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined, false if it is replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+};
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** NEON kernel to perform a Box 3x3 filter using FP16 vector arithmetic;
+ * only declared as a distinct class when __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined. */
+class NEBox3x3FP16Kernel : public NEBox3x3Kernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEBox3x3FP16Kernel";
+    }
+    /** Default constructor */
+    NEBox3x3FP16Kernel() = default;
+    /** Copy construction is disabled (as this class contains pointers) */
+    NEBox3x3FP16Kernel(const NEBox3x3FP16Kernel &) = delete;
+    /** Copy assignment is disabled (as this class contains pointers) */
+    NEBox3x3FP16Kernel &operator=(const NEBox3x3FP16Kernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEBox3x3FP16Kernel(NEBox3x3FP16Kernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEBox3x3FP16Kernel &operator=(NEBox3x3FP16Kernel &&) = default;
+    /** Default destructor */
+    ~NEBox3x3FP16Kernel() = default;
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+};
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** Without FP16 vector arithmetic support, NEBox3x3FP16Kernel is an alias of @ref NEBox3x3Kernel */
+using NEBox3x3FP16Kernel = NEBox3x3Kernel;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEBOX3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index da33c1b..7a2bf20 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
+#include "src/core/NEON/kernels/NECannyEdgeKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -41,22 +41,14 @@
 #include <cstdint>
 #include <tuple>
 
-using namespace arm_compute;
-
 namespace arm_compute
 {
-class Coordinates;
-} // namespace arm_compute
-
 namespace
 {
 constexpr int NO_EDGE = 0;
 constexpr int EDGE    = 255;
 constexpr int MAYBE   = 127;
-} // namespace
 
-namespace
-{
 inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
 {
     // Constant use for evaluating score1 and score3
@@ -873,6 +865,8 @@
 }
 } // namespace
 
+NEGradientKernel::~NEGradientKernel() = default;
+
 NEGradientKernel::NEGradientKernel()
     : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
 {
@@ -961,6 +955,7 @@
     gx, gy, magnitude, phase);
 }
 
+NEEdgeNonMaxSuppressionKernel::~NEEdgeNonMaxSuppressionKernel() = default;
 NEEdgeNonMaxSuppressionKernel::NEEdgeNonMaxSuppressionKernel()
     : _func(nullptr), _magnitude(nullptr), _phase(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0)
 {
@@ -1045,6 +1040,7 @@
     magnitude, phase, output);
 }
 
+NEEdgeTraceKernel::~NEEdgeTraceKernel() = default;
 NEEdgeTraceKernel::NEEdgeTraceKernel()
     : _input(nullptr), _output(nullptr)
 {
@@ -1123,3 +1119,4 @@
     },
     input, output);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.h b/src/core/NEON/kernels/NECannyEdgeKernel.h
new file mode 100644
index 0000000..eff7352
--- /dev/null
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECANNYEDGEKERNEL_H
+#define ARM_COMPUTE_NECANNYEDGEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Computes magnitude and quantised phase from the input gradients. */
+class NEGradientKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGradientKernel";
+    }
+    /** Default constructor */
+    NEGradientKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGradientKernel(const NEGradientKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGradientKernel &operator=(const NEGradientKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGradientKernel(NEGradientKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGradientKernel &operator=(NEGradientKernel &&) = default;
+    /** Default destructor */
+    ~NEGradientKernel();
+
+    /** Initialise the kernel's sources and destinations.
+     *
+     * @note gx, gy and magnitude must all have the same element size (either 16 or 32 bits)
+     *
+     * @param[in]  gx        Source tensor - Gx component. Data type supported: S16/S32.
+     * @param[in]  gy        Source tensor - Gy component. Data type supported: same as @p gx.
+     * @param[out] magnitude Destination tensor - Magnitude. Data type supported: U16 (if the data type of @p gx is S16) / U32 (if the data type of @p gx is S32).
+     * @param[out] phase     Destination tensor - Quantized phase. Data type supported: U8.
+     * @param[in]  norm_type Normalization type. If 1, L1-Norm otherwise L2-Norm
+     */
+    virtual void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+protected:
+    /** Common signature for all the specialised gradient functions
+     *
+     * @param[in]  gx_ptr        Pointer to the first input tensor.
+     * @param[in]  gy_ptr        Pointer to the second input tensor.
+     * @param[out] magnitude_ptr Pointer to the first output tensor
+     * @param[out] phase_ptr     Pointer to the second output tensor
+     */
+    using GradientFunction = void(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr);
+
+    GradientFunction *_func;      /**< Gradient function to use for the particular tensor types passed to configure() */
+    const ITensor    *_gx;        /**< Source tensor - Gx component */
+    const ITensor    *_gy;        /**< Source tensor - Gy component */
+    ITensor          *_magnitude; /**< Destination tensor - Magnitude */
+    ITensor          *_phase;     /**< Destination tensor - Quantized phase */
+};
+
+/** NEON kernel to perform Non-Maxima suppression for Canny Edge.
+ *
+ * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
+ *       to characterize points as possible edges. Thus, at the end, each point will be set to EDGE, NO_EDGE or MAYBE.
+ *
+ * @note Hysteresis is computed in @ref NEEdgeTraceKernel
+ */
+class NEEdgeNonMaxSuppressionKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEEdgeNonMaxSuppressionKernel";
+    }
+    /** Default constructor */
+    NEEdgeNonMaxSuppressionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeNonMaxSuppressionKernel(const NEEdgeNonMaxSuppressionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeNonMaxSuppressionKernel &operator=(const NEEdgeNonMaxSuppressionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEEdgeNonMaxSuppressionKernel(NEEdgeNonMaxSuppressionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEEdgeNonMaxSuppressionKernel &operator=(NEEdgeNonMaxSuppressionKernel &&) = default;
+    /** Default destructor */
+    ~NEEdgeNonMaxSuppressionKernel();
+
+    /** Initialise the kernel's sources, destination, hysteresis thresholds and border mode.
+     *
+     * @param[in]  magnitude        Source tensor - Magnitude. Data type supported: U16/U32.
+     * @param[in]  phase            Source tensor - Quantized phase. Data type supported: U8.
+     * @param[out] output           Output tensor. Data type supported: U8. It will be filled with 0 for "no edge", 127 for "maybe", 255 for "edge"
+     * @param[in]  upper_thr        Upper threshold used for the hysteresis
+     * @param[in]  lower_thr        Lower threshold used for the hysteresis
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *magnitude, const ITensor *phase, ITensor *output, int32_t upper_thr, int32_t lower_thr, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Common signature for all the specialised non-maxima suppression functions
+     *
+     * @param[in]  magnitude_ptr Pointer to the first input tensor.
+     * @param[in]  phase_ptr     Pointer to the second input tensor.
+     * @param[out] output_ptr    Pointer to the output tensor
+     * @param[in]  stride_mag    Stride of the magnitude tensor
+     * @param[in]  upper_thr     Upper threshold used for the hysteresis
+     * @param[in]  lower_thr     Lower threshold used for the hysteresis
+     */
+    using EdgeNonMaxSupprFunction = void(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t upper_thr,
+                                         const int32_t lower_thr);
+
+    EdgeNonMaxSupprFunction *_func;      /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
+    const ITensor           *_magnitude; /**< Source tensor - Magnitude */
+    const ITensor           *_phase;     /**< Source tensor - Quantized phase */
+    ITensor                 *_output;    /**< Destination tensor */
+    int32_t                  _lower_thr; /**< Lower threshold used for the hysteresis */
+    int32_t                  _upper_thr; /**< Upper threshold used for the hysteresis */
+};
+
+/** NEON kernel to perform Edge tracing */
+class NEEdgeTraceKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEEdgeTraceKernel";
+    }
+    /** Default constructor */
+    NEEdgeTraceKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeTraceKernel(const NEEdgeTraceKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeTraceKernel &operator=(const NEEdgeTraceKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEEdgeTraceKernel(NEEdgeTraceKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEEdgeTraceKernel &operator=(NEEdgeTraceKernel &&) = default;
+    /** Default destructor */
+    ~NEEdgeTraceKernel();
+
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in,out] input  Source tensor. Data type supported: U8. Must contain 0 for "no edge", 127 for "maybe", 255 for "edge"
+     * @param[in,out] output Destination tensor. Data type supported: U8. Must be initialized to 0 (No edge).
+     */
+    void configure(ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+    bool       is_parallelisable() const override;
+
+private:
+    ITensor *_input;  /**< Source tensor */
+    ITensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NECANNYEDGEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index 7bd3808..6bfd4c5 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
+#include "src/core/NEON/kernels/NEChannelCombineKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.h b/src/core/NEON/kernels/NEChannelCombineKernel.h
new file mode 100644
index 0000000..a3372be
--- /dev/null
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H
+#define ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+#include <array>
+#include <cstdint>
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the channel combine kernel */
+class NEChannelCombineKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEChannelCombineKernel";
+    }
+    /** Default constructor */
+    NEChannelCombineKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelCombineKernel(const NEChannelCombineKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelCombineKernel &operator=(const NEChannelCombineKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEChannelCombineKernel(NEChannelCombineKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEChannelCombineKernel &operator=(NEChannelCombineKernel &&) = default;
+    /** Default destructor */
+    ~NEChannelCombineKernel() = default;
+
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8
+     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8
+     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8
+     * @param[in]  plane3 The 2D plane that forms channel 3. Data type supported: U8
+     * @param[out] output The single planar output tensor. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+     */
+    void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output);
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8
+     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8
+     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8
+     * @param[out] output The multi planar output tensor. Formats supported: NV12/NV21/IYUV/YUV444
+     */
+    void configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    bool is_parallelisable() const override;
+
+private:
+    /** Combine 3 planes to form a three channel single plane tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_3C(const Window &win);
+    /** Combine 4 planes to form a four channel single plane tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_4C(const Window &win);
+    /** Combine 3 planes to form a single plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    template <bool is_yuyv>
+    void combine_YUV_1p(const Window &win);
+    /** Combine 3 planes to form a two plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_YUV_2p(const Window &win);
+    /** Combine 3 planes to form a three plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_YUV_3p(const Window &win);
+    /** Copies a full plane to the output tensor.
+     *
+     * @param[in] win      Region on which to execute the kernel.
+     * @param[in] plane_id Identifier of the plane to copy. */
+    void copy_plane(const Window &win, uint32_t plane_id);
+    /** Common signature for all the specialised ChannelCombine functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using ChannelCombineFunction = void (NEChannelCombineKernel::*)(const Window &window);
+    /** ChannelCombine function to use for the particular tensor types passed to configure() */
+    ChannelCombineFunction _func;
+    std::array<const ITensor *, 4> _planes;
+    ITensor     *_output;
+    IMultiImage *_output_multi;
+    std::array<uint32_t, 3> _x_subsampling;
+    std::array<uint32_t, 3> _y_subsampling;
+    unsigned int _num_elems_processed_per_iteration;
+    bool         _is_parallelizable;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
index 86245ac..d0d1c68 100644
--- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
+#include "src/core/NEON/kernels/NEChannelExtractKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -29,11 +29,11 @@
 #include "arm_compute/core/IMultiImage.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.h b/src/core/NEON/kernels/NEChannelExtractKernel.h
new file mode 100644
index 0000000..0b2847d
--- /dev/null
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H
+#define ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the channel extract kernel */
+class NEChannelExtractKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEChannelExtractKernel";
+    }
+    /** Default constructor */
+    NEChannelExtractKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelExtractKernel(const NEChannelExtractKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelExtractKernel &operator=(const NEChannelExtractKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEChannelExtractKernel(NEChannelExtractKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEChannelExtractKernel &operator=(NEChannelExtractKernel &&) = default;
+    /** Default destructor */
+    ~NEChannelExtractKernel() = default;
+
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input   Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
+     * @param[in]  channel Channel to extract.
+     * @param[out] output  Destination tensor. Format supported: U8
+     */
+    void configure(const ITensor *input, Channel channel, ITensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input   Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
+     * @param[in]  channel Channel to extract.
+     * @param[out] output  Single-planar destination image. Format supported: U8
+     */
+    void configure(const IMultiImage *input, Channel channel, IImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Extract one channel from a two channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_2C_img(const Window &win);
+    /** Extract one channel from a three channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_3C_img(const Window &win);
+    /** Extract one channel from a four channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_4C_img(const Window &win);
+    /** Extract U/V channel from a single planar YUYV/UYVY tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_YUYV_uv(const Window &win);
+    /** Copies a full plane to the output tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void copy_plane(const Window &win);
+    /** Common signature for all the specialised ChannelExtract functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using ChannelExtractFunction = void (NEChannelExtractKernel::*)(const Window &window);
+    /** ChannelExtract function to use for the particular tensor types passed to configure() */
+    ChannelExtractFunction _func;
+    unsigned int           _lut_index;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H */
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
index 6d04d71..6e16f24 100644
--- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
+#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.h b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.h
new file mode 100644
index 0000000..c7d09df
--- /dev/null
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECHANNELSHUFFLELAYERKERNEL_H
+#define ARM_COMPUTE_NECHANNELSHUFFLELAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the channel shuffle kernel */
+class NEChannelShuffleLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEChannelShuffleLayerKernel";
+    }
+    /** Default constructor */
+    NEChannelShuffleLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelShuffleLayerKernel(const NEChannelShuffleLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelShuffleLayerKernel &operator=(const NEChannelShuffleLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEChannelShuffleLayerKernel(NEChannelShuffleLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEChannelShuffleLayerKernel &operator=(NEChannelShuffleLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEChannelShuffleLayerKernel() = default;
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  input      Input tensor. Data types supported: All
+     * @param[out] output     Output tensor. Data type supported: Same as @p input
+     * @param[in]  num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
+     */
+    void configure(const ITensor *input, ITensor *output, unsigned int num_groups);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEChannelShuffleLayerKernel
+     *
+     * @param[in] input      Input tensor info. Data types supported: All
+     * @param[in] output     Output tensor info. Data type supported: Same as @p input
+     * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;
+    ITensor       *_output;
+    unsigned int   _num_groups;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NECHANNELSHUFFLELAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index f319237..97b68d1 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+#include "src/core/NEON/kernels/NECol2ImKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h
new file mode 100644
index 0000000..59d1d74
--- /dev/null
+++ b/src/core/NEON/kernels/NECol2ImKernel.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECOL2IMKERNEL_H
+#define ARM_COMPUTE_NECOL2IMKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+#include "arm_compute/core/Size2D.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform col2im reshaping.
+ *
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref NEIm2ColKernel.
+ *
+ * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
+ *
+ * @f[
+ * \left( \begin{array}{ccccccccc}
+ * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccc}
+ * a0 & a1 & a2 \\
+ * a3 & a4 & a5 \\
+ * a6 & a7 & a8 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NECol2ImKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NECol2ImKernel";
+    }
+    /** Default constructor */
+    NECol2ImKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECol2ImKernel(const NECol2ImKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECol2ImKernel &operator=(const NECol2ImKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NECol2ImKernel(NECol2ImKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NECol2ImKernel &operator=(NECol2ImKernel &&) = default;
+    /** Default destructor */
+    ~NECol2ImKernel() = default;
+
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input          The input tensor to convert. Data types supported: All
+     * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+     *                            while the rest represent batch of outputs. Data types supported: Same as @p input
+     * @param[in]  convolved_dims Output convolved dimensions.
+     */
+    void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims);
+    /** Static function to check if given info will lead to a valid configuration of @ref NECol2ImKernel
+     *
+     * @param[in] input          The input tensor to convert. Data types supported: All
+     * @param[in] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+     *                           while the rest represent batch of outputs. Data types supported: Same as @p input
+     * @param[in] convolved_dims Output convolved dimensions.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Template function to run the col2im
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run_col2im(const Window &window);
+
+    /** Common signature for all the specialised col2im functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using Col2ImFunctionPtr = void (NECol2ImKernel::*)(const Window &window);
+
+    Col2ImFunctionPtr _func; /**< Pointer to the col2im member function to use (see Col2ImFunctionPtr) */
+    const ITensor    *_input; /**< Source tensor - NOTE(review): presumably the input given to configure(), confirm in the .cpp */
+    ITensor          *_output; /**< Destination tensor - NOTE(review): presumably the output given to configure(), confirm in the .cpp */
+    Size2D            _convolved_dims; /**< Convolved dimensions - NOTE(review): presumably the value given to configure(), confirm in the .cpp */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NECOL2IMKERNEL_H */
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
index f933a2a..23270d4 100644
--- a/src/core/NEON/kernels/NEColorConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
+#include "src/core/NEON/kernels/NEColorConvertKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.h b/src/core/NEON/kernels/NEColorConvertKernel.h
new file mode 100644
index 0000000..1adb624
--- /dev/null
+++ b/src/core/NEON/kernels/NEColorConvertKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_COLORCONVERTKERNEL_H
+#define ARM_COMPUTE_COLORCONVERTKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the color convert kernel */
+class NEColorConvertKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEColorConvertKernel";
+    }
+    /** Default constructor */
+    NEColorConvertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEColorConvertKernel(const NEColorConvertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEColorConvertKernel &operator=(const NEColorConvertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEColorConvertKernel(NEColorConvertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEColorConvertKernel &operator=(NEColorConvertKernel &&) = default;
+    /** Default destructor */
+    ~NEColorConvertKernel() = default;
+
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
+     * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
+     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888),
+     *                                                          U8 (if the formats of @p input is RGB888)
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
+     */
+    void configure(const IMultiImage *input, IImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+     * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
+     */
+    void configure(const IImage *input, IMultiImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
+     */
+    void configure(const IMultiImage *input, IMultiImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
+    const void           *_input;
+    void                 *_output;
+    ColorConvertFunction *_func;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_COLORCONVERTKERNEL_H */
diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
index 8716cfd..597c283 100644
--- a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
new file mode 100644
index 0000000..766ee88
--- /dev/null
+++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
+#define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
+ *
+ * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
+ *       - It follows a Convolution layer
+ *       - The data layout used by the network does not match the one the model has been trained in.
+ *
+ * @note This function assumes the weights are already reshaped (transposed)
+ */
+class NEConvertFullyConnectedWeightsKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEConvertFullyConnectedWeightsKernel";
+    }
+    /** Default constructor */
+    NEConvertFullyConnectedWeightsKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvertFullyConnectedWeightsKernel(const NEConvertFullyConnectedWeightsKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvertFullyConnectedWeightsKernel &operator=(const NEConvertFullyConnectedWeightsKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEConvertFullyConnectedWeightsKernel(NEConvertFullyConnectedWeightsKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEConvertFullyConnectedWeightsKernel &operator=(NEConvertFullyConnectedWeightsKernel &&) = default;
+    /** Default destructor */
+    ~NEConvertFullyConnectedWeightsKernel() = default;
+    /** Set the input and output tensor.
+     *
+     * @param[in]  input                Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
+     * @param[out] output               The converted weights tensor. Shape and Data Type: Same as @p input.
+     * @param[in]  original_input_shape Shape of the original input tensor (the one entering fully connected layer).
+     * @param[in]  data_layout          The data layout the weights have been trained in.
+     */
+    void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeightsKernel
+     *
+     * @param[in] input                Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
+     * @param[in] output               The converted weights tensor info. Shape and Data Type: Same as @p input.
+     * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
+     * @param[in] data_layout          The data layout the weights have been trained in.
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Template function to run the permute
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run_convert_fc_weights(const Window &window);
+
+    const ITensor *_input;
+    ITensor       *_output;
+    unsigned int   _factor1; /*  equals to the number of elements per original input plane if @p data_layout == NCHW; its number of channels otherwise */
+    unsigned int   _factor2; /*  equals to the number of elements per original input plane if @p data_layout == NHWC; its number of channels otherwise */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTSKERNEL_H */
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
index bd8ea30..1f2170f 100644
--- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
+++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
new file mode 100644
index 0000000..2f80361
--- /dev/null
+++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H
+#define ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** NEON kernel to convert asymmetric unsigned to asymmetric signed and vice-versa */
+class NEConvertQuantizedSignednessKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEConvertQuantizedSignednessKernel";
+    }
+    /** Default constructor */
+    NEConvertQuantizedSignednessKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NEConvertQuantizedSignednessKernel(const NEConvertQuantizedSignednessKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NEConvertQuantizedSignednessKernel &operator=(const NEConvertQuantizedSignednessKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEConvertQuantizedSignednessKernel(NEConvertQuantizedSignednessKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEConvertQuantizedSignednessKernel &operator=(NEConvertQuantizedSignednessKernel &&) = default;
+    /** Default destructor */
+    ~NEConvertQuantizedSignednessKernel() = default;
+    /** Initialize the kernel's input, output.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output Destination tensor. Data types supported: opposite of @p input.
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEConvertQuantizedSignednessKernel
+     *
+     * @param[in] input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in] output Destination tensor. Data types supported: opposite of @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;
+    ITensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H */
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
index 69b65b2..bac2743 100644
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
+#include "src/core/NEON/kernels/NEConvolutionKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.h b/src/core/NEON/kernels/NEConvolutionKernel.h
new file mode 100644
index 0000000..b8bf1d1
--- /dev/null
+++ b/src/core/NEON/kernels/NEConvolutionKernel.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL_H
+#define ARM_COMPUTE_NECONVOLUTIONKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/NEON/INESimpleKernel.h"
+
+#include <array>
+#include <cstdint>
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/****************************************************************************************\
+ *                                    Square Convolution                                *
+\****************************************************************************************/
+
+/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
+ * The client can supply a convolution matrix \f$ C_{m,n} \f$.
+ * @f{eqnarray}{
+ *  k_0 &=& \frac{m}{2}  \\
+ *  l_0 &=& \frac{n}{2}  \\
+ *  sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
+ *  @f}
+ *
+ * @note The above equation for this function is similar to the default OpenCV Filter2D function,
+ *       which actually computes a correlation and not a convolution.
+ *       In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
+ */
+template <unsigned int matrix_size>
+class NEConvolutionKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEConvolutionKernel";
+    }
+    /** Default constructor */
+    NEConvolutionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NEConvolutionKernel(const NEConvolutionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NEConvolutionKernel &operator=(const NEConvolutionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEConvolutionKernel(NEConvolutionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEConvolutionKernel &operator=(NEConvolutionKernel &&) = default;
+    /** Default destructor */
+    ~NEConvolutionKernel() = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    template <typename OutputType>
+    void convolution(const Window &win);
+
+protected:
+    uint32_t _scale;                                             /**< scale of the convolution */
+    std::array<int16_t, matrix_size *matrix_size> _convolution;  /**< convolution matrix */
+};
+
+/** Interface for the kernel which applied a 3x3 convolution to a tensor.*/
+using NEConvolution3x3Kernel = NEConvolutionKernel<3>;
+/** Interface for the kernel which applied a 5x5 convolution to a tensor.*/
+using NEConvolution5x5Kernel = NEConvolutionKernel<5>;
+/** Interface for the kernel which applied a 7x7 convolution to a tensor.*/
+using NEConvolution7x7Kernel = NEConvolutionKernel<7>;
+/** Interface for the kernel which applied a 9x9 convolution to a tensor.*/
+using NEConvolution9x9Kernel = NEConvolutionKernel<9>;
+
+/****************************************************************************************\
+ *                              Separable Square Convolution                            *
+\****************************************************************************************/
+
+/** Kernel for the Horizontal pass of a Separable Convolution */
+template <unsigned int matrix_size>
+class NESeparableConvolutionHorKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NESeparableConvolutionHorKernel";
+    }
+    /** Default constructor */
+    NESeparableConvolutionHorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NESeparableConvolutionHorKernel(const NESeparableConvolutionHorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NESeparableConvolutionHorKernel &operator=(const NESeparableConvolutionHorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESeparableConvolutionHorKernel(NESeparableConvolutionHorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESeparableConvolutionHorKernel &operator=(NESeparableConvolutionHorKernel &&) = default;
+    /** Default destructor */
+    ~NESeparableConvolutionHorKernel() = default;
+
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U16, S16, S32.
+     * @param[in]  conv_row         Convolution matrix to apply to the input tensor.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Apply the object's convolution to the given window of the input tensor.
+     *
+     * @param[in] window Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolve(const Window &window);
+
+    std::array<int16_t, matrix_size> _conv_row; /**< Convolution coefficients */
+    BorderSize _border_size;                    /**< Border size */
+};
+
+/** Interface for the kernel which applied a 5x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution5x5HorKernel = NESeparableConvolutionHorKernel<5>;
+/** Interface for the kernel which applied a 7x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution7x7HorKernel = NESeparableConvolutionHorKernel<7>;
+/** Interface for the kernel which applied a 9x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution9x9HorKernel = NESeparableConvolutionHorKernel<9>;
+
+/** Kernel for the Vertical pass of a Separable Convolution */
+template <unsigned int matrix_size>
+class NESeparableConvolutionVertKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NESeparableConvolutionVertKernel";
+    }
+    /** Default constructor */
+    NESeparableConvolutionVertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NESeparableConvolutionVertKernel(const NESeparableConvolutionVertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NESeparableConvolutionVertKernel &operator=(const NESeparableConvolutionVertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESeparableConvolutionVertKernel(NESeparableConvolutionVertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESeparableConvolutionVertKernel &operator=(NESeparableConvolutionVertKernel &&) = default;
+    /** Default destructor */
+    ~NESeparableConvolutionVertKernel() = default;
+
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U16, S16, S32.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv_col         Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Apply the object's convolution to the given window of the input tensor.
+     *  This function is used if the intermediate values have been stored as U16.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_u16(const Window &win);
+    /** Apply the object's convolution to the given window of the input tensor.
+     *  This function is used if the intermediate values have been stored as S16.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_s16(const Window &win);
+    /** Apply the object's convolution to the given window of the input tensor.
+     *  This function is used if the intermediate values have been stored as S32.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_s32(const Window &win);
+
+    std::array<int16_t, matrix_size> _conv_col; /**< Convolution coefficients */
+    uint32_t _scale;                            /**< Convolution's scale */
+};
+
+/** Interface for the kernel which applied a 1x5 vertical convolution to a tensor.*/
+using NESeparableConvolution5x5VertKernel = NESeparableConvolutionVertKernel<5>;
+/** Interface for the kernel which applied a 1x7 vertical convolution to a tensor.*/
+using NESeparableConvolution7x7VertKernel = NESeparableConvolutionVertKernel<7>;
+/** Interface for the kernel which applied a 1x9 vertical convolution to a tensor.*/
+using NESeparableConvolution9x9VertKernel = NESeparableConvolutionVertKernel<9>;
+
+/****************************************************************************************\
+ *                                 Rectangle Convolution                                *
+\****************************************************************************************/
+
+/** Kernel for the running convolution on a rectangle matrix.
+ *
+ * @note Supports combinations of 3,5,7 and 9.
+ */
+class NEConvolutionRectangleKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEConvolutionRectangleKernel";
+    }
+    /** Default constructor */
+    NEConvolutionRectangleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvolutionRectangleKernel(const NEConvolutionRectangleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvolutionRectangleKernel &operator=(const NEConvolutionRectangleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &&) = default;
+    /** Default destructor */
+    ~NEConvolutionRectangleKernel() = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  width            Width of convolution matrix (Number of columns)
+     * @param[in]  height           Height of convolution matrix (Number of rows)
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    unsigned int get_index(uint32_t val);
+    /** Apply the object's convolution to the given window of the input tensor.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType, unsigned int rows, unsigned int cols>
+    void convolution(const Window &win);
+
+protected:
+    const ITensor            *_input;       /**< Input tensor */
+    ITensor                  *_output;      /**< Output tensor */
+    uint32_t                  _scale;       /**< Scale of the convolution */
+    std::vector<int16_t>      _convolution; /**< Convolution matrix */
+    BorderSize                _border_size; /**< Calculated border width */
+    uint32_t                  _func_idx;    /**< Index used to specify convolution function to be used */
+    const static unsigned int _nr_supported_sizes
+    {
+        4
+    }; /**< Number of supported permutations */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NECONVOLUTIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NECopyKernel.cpp b/src/core/NEON/kernels/NECopyKernel.cpp
index b299957..337c44c 100644
--- a/src/core/NEON/kernels/NECopyKernel.cpp
+++ b/src/core/NEON/kernels/NECopyKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NECopyKernel.h b/src/core/NEON/kernels/NECopyKernel.h
new file mode 100644
index 0000000..62b7b80
--- /dev/null
+++ b/src/core/NEON/kernels/NECopyKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECOPYKERNEL_H
+#define ARM_COMPUTE_NECOPYKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a copy between two tensors */
+class NECopyKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NECopyKernel";
+    }
+    /** Default constructor */
+    NECopyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NECopyKernel(const NECopyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NECopyKernel &operator=(const NECopyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NECopyKernel(NECopyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NECopyKernel &operator=(NECopyKernel &&) = default;
+    /** Default destructor */
+    ~NECopyKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input   Source tensor. Data types supported: All
+     * @param[out] output  Destination tensor. Data types supported: same as @p input.
+     * @param[in]  padding (Optional) Padding to be applied to the input tensor
+     */
+    void configure(const ITensor *input, ITensor *output, const PaddingList &padding = PaddingList());
+    /** Static function to check if given info will lead to a valid configuration of @ref NECopyKernel
+     *
+     * @param[in] input   Source tensor info. Data types supported: All
+     * @param[in] output  Destination tensor info. Data types supported: same as @p input.
+     * @param[in] padding (Optional) Padding to be applied to the input tensor
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList());
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;   /**< Presumably the source tensor given to configure() - verify in the .cpp */
+    ITensor       *_output;  /**< Presumably the destination tensor given to configure() - verify in the .cpp */
+    PaddingList    _padding; /**< Presumably the padding list given to configure() - verify in the .cpp */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NECOPYKERNEL_H */
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index 5fb55d9..c94cdae 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NECropKernel.h"
+#include "src/core/NEON/kernels/NECropKernel.h"
 
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h
new file mode 100644
index 0000000..742215e
--- /dev/null
+++ b/src/core/NEON/kernels/NECropKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEON_CROP_KERNEL_H
+#define ARM_COMPUTE_NEON_CROP_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the kernel to perform tensor cropping */
+class NECropKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NECropKernel";
+    }
+    /** Default constructor */
+    NECropKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECropKernel(const NECropKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECropKernel &operator=(const NECropKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NECropKernel(NECropKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NECropKernel &operator=(NECropKernel &&) = default;
+    /** Default destructor */
+    ~NECropKernel() = default;
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     * @note Padding not supported.
+     *
+     * @param[in]  input               Source tensor. Data type supported: U8/U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
+     * @param[in]  crop_boxes          Tensor containing all possible boxes used to crop the image, each represented by 4 normalized values.
+     *                                 Data type supported: F32
+     * @param[in]  box_ind             One dimensional tensor mapping the @p crop_box_ind to the index of the 3D image in @p input.
+     *                                 Data type supported: F32
+     * @param[out] output              Destination tensor. Data type supported: F32
+     * @param[in]  crop_box_ind        Index of the crop box to be used from @p crop_boxes. Default is 0.
+     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
+     */
+    void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NECropKernel
+     *
+     * @note Supported tensor rank: up to 4
+     * @note Padding not supported.
+     *
+     * @param[in] input               Source tensor info. Data type supported: U8/U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
+     * @param[in] crop_boxes          Tensor info for tensor containing all possible boxes used to crop the image. Data type supported: F32
+     * @param[in] box_ind             Tensor info for the one dimensional tensor mapping the @p crop_box_ind to the index of the 3D image
+     *                                in @p input. Data type supported: F32
+     * @param[in] output              Destination tensor info. Data type supported: F32
+     * @param[in] crop_box_ind        Index of the crop box to be used from @p crop_boxes. Default is 0.
+     * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+
+    /** Configure output tensor's shape as this can only be determined at runtime. */
+    void configure_output_shape();
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+    /** Function to use for in bounds crop for the particular tensor types passed to configure() */
+    using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
+
+private:
+    const ITensor *_input;
+    const ITensor *_crop_boxes;
+    const ITensor *_box_ind;
+    ITensor       *_output;
+
+    Coordinates _start;
+    Coordinates _end;
+    uint32_t    _crop_box_ind;
+    float       _extrapolation_value;
+    /** The number of rows out of bounds at the start and end of output. */
+    std::array<uint32_t, 2> _rows_out_of_bounds;
+    /** The number of columns out of bounds at the start and end of output. */
+    std::array<uint32_t, 2> _cols_out_of_bounds;
+
+    NECropKernel::InBoundsCropFunction *_in_bounds_crop_function;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEON_CROP_KERNEL_H */
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
index 5628802..58a9a2f 100644
--- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
+#include "src/core/NEON/kernels/NECumulativeDistributionKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.h b/src/core/NEON/kernels/NECumulativeDistributionKernel.h
new file mode 100644
index 0000000..1f8c65b
--- /dev/null
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H
+#define ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class IDistribution1D;
+class ILut;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the cumulative distribution (cumulative summation) calculation kernel.
+ *
+ * This kernel calculates the cumulative sum of a given distribution (meaning that each output element
+ * is the sum of all its previous elements including itself) and creates a lookup table with the normalized
+ * pixel intensities which is used to improve the contrast of the image.
+ */
+class NECumulativeDistributionKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NECumulativeDistributionKernel";
+    }
+    /** Default constructor */
+    NECumulativeDistributionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECumulativeDistributionKernel(const NECumulativeDistributionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECumulativeDistributionKernel &operator=(const NECumulativeDistributionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NECumulativeDistributionKernel(NECumulativeDistributionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NECumulativeDistributionKernel &operator=(NECumulativeDistributionKernel &&) = default;
+    /** Default destructor */
+    ~NECumulativeDistributionKernel() = default;
+    /** Set the input and output distribution.
+     *
+     * @param[in]  input          Input image. Data type supported: U8
+     * @param[in]  distribution   Unnormalized 256-bin distribution of the input image.
+     * @param[out] cumulative_sum Cumulative distribution (Summed histogram). Should be same size as @p distribution.
+     * @param[out] output         Equalization lookup table. Should consist of 256 entries of U8 elements.
+     */
+    void configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    bool is_parallelisable() const override;
+
+private:
+    const IImage          *_input;          /**< Input image. */
+    const IDistribution1D *_distribution;   /**< Input histogram of the input image. */
+    IDistribution1D       *_cumulative_sum; /**< The cumulative distribution. */
+    ILut                  *_output;         /**< Output with the equalization lookup table. */
+private:
+    static const uint32_t _histogram_size = 256; /**< Default histogram size of 256. */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index b500268..ba90bfc 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
new file mode 100644
index 0000000..02c5479
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H
+#define ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the depth concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class NEDepthConcatenateLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEDepthConcatenateLayerKernel";
+    }
+    /** Default constructor */
+    NEDepthConcatenateLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConcatenateLayerKernel(const NEDepthConcatenateLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConcatenateLayerKernel &operator=(const NEDepthConcatenateLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDepthConcatenateLayerKernel(NEDepthConcatenateLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDepthConcatenateLayerKernel &operator=(NEDepthConcatenateLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEDepthConcatenateLayerKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     input        Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]     depth_offset The offset on the Z axis.
+     * @param[in,out] output       Output tensor info. Data types supported: Same as @p input.
+     *
+     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
+     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
+     *
+     */
+    void configure(const ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthConcatenateLayerKernel
+     *
+     * @param[in] input        Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] depth_offset The offset on the Z axis.
+     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+private:
+    using DepthConcatFunction = void(const ITensor *in, ITensor *out, unsigned int depth_offset, const Window &window);
+
+private:
+    DepthConcatFunction *_func;         /**< Concatenation routine; presumably selected in configure() - verify in the .cpp */
+    unsigned int         _depth_offset; /**< Presumably the Z-axis offset given to configure() - verify in the .cpp */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index 259ece7..d6c89a4 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
+#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.h b/src/core/NEON/kernels/NEDepthConvertLayerKernel.h
new file mode 100644
index 0000000..30fe1ed
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DEPTHCONVERTKERNEL_H
+#define ARM_COMPUTE_DEPTHCONVERTKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Depth conversion kernel
+ *  This function ignores the scale and zeroPoint of quantized tensors, i.e. QASYMM8 input is treated as uint8 values.
+ */
+class NEDepthConvertLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEDepthConvertLayerKernel";
+    }
+    /** Default constructor */
+    NEDepthConvertLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConvertLayerKernel(const NEDepthConvertLayerKernel &) = delete;
+    /** Default move constructor */
+    NEDepthConvertLayerKernel(NEDepthConvertLayerKernel &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConvertLayerKernel &operator=(const NEDepthConvertLayerKernel &) = delete;
+    /** Default move assignment operator */
+    NEDepthConvertLayerKernel &operator=(NEDepthConvertLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEDepthConvertLayerKernel() = default;
+    /** Set the input and output of the kernel
+     *
+     * Valid conversions Input -> Output :
+     *
+     *   - QASYMM8_SIGNED -> S16, S32, F32, F16
+     *   - QASYMM8        -> U16, S16, S32, F32, F16
+     *   - U8             -> U16, S16, S32, F32, F16
+     *   - U16            -> U8, U32
+     *   - S16            -> QASYMM8_SIGNED, U8, S32
+     *   - BFLOAT16       -> F32
+     *   - F16            -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8
+     *   - S32            -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8
+     *   - F32            -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8
+     *
+     * @param[in]  input  The input tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/S32/BFLOAT16/F16/F32.
+     * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32.
+     * @param[in]  policy Conversion policy.
+     * @param[in]  shift  (Optional) Value for down/up conversions. Must be 0 <= shift < 8.
+     */
+    void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift = 0);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthConvertLayerKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/S32/BFLOAT16/F16/F32.
+     * @param[in] output Destination tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32.
+     * @param[in] policy Conversion policy
+     * @param[in] shift  (Optional) Value for down/up conversions. Must be 0 <= shift < 8.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;  /**< Presumably the input tensor given to configure() - verify in the .cpp */
+    ITensor       *_output; /**< Presumably the output tensor given to configure() - verify in the .cpp */
+    ConvertPolicy  _policy; /**< Presumably the conversion policy given to configure() - verify in the .cpp */
+    uint32_t       _shift;  /**< Presumably the shift given to configure() - verify in the .cpp */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_DEPTHCONVERTKERNEL_H */
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index 403e7aa..6dcc85e 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
+#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
new file mode 100644
index 0000000..7e18dd8
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
+#define ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the depth to space kernel */
+class NEDepthToSpaceLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEDepthToSpaceLayerKernel";
+    }
+    /** Default constructor */
+    NEDepthToSpaceLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthToSpaceLayerKernel(const NEDepthToSpaceLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthToSpaceLayerKernel &operator=(const NEDepthToSpaceLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDepthToSpaceLayerKernel(NEDepthToSpaceLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDepthToSpaceLayerKernel &operator=(NEDepthToSpaceLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEDepthToSpaceLayerKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All
+     * @param[out] output      Tensor output. Data types supported: same as @p input
+     * @param[in]  block_shape Block shape value.
+     */
+    void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthToSpaceLayerKernel.
+     *
+     * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All
+     * @param[in] output      Tensor output info. Data types supported: same as @p input
+     * @param[in] block_shape Block shape value.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;       /**< Source tensor */
+    ITensor       *_output;      /**< Destination tensor */
+    int32_t        _block_shape; /**< Block shape */
+    DataLayout     _data_layout; /**< Data layout of the operation */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
index 533b374..6e5322c 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
+#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
 
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
new file mode 100644
index 0000000..713cdcd
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
+#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
+
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/NEON/INEKernel.h"
+#include "support/Requires.h"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include <arm_neon.h>
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the kernel to run a native depthwise convolution on a tensor (NHWC data layout, see configure()). */
+class NEDepthwiseConvolutionLayerNativeKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEDepthwiseConvolutionLayerNativeKernel";
+    }
+    /** Default constructor */
+    NEDepthwiseConvolutionLayerNativeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionLayerNativeKernel(const NEDepthwiseConvolutionLayerNativeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionLayerNativeKernel &operator=(const NEDepthwiseConvolutionLayerNativeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDepthwiseConvolutionLayerNativeKernel(NEDepthwiseConvolutionLayerNativeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDepthwiseConvolutionLayerNativeKernel &operator=(NEDepthwiseConvolutionLayerNativeKernel &&) = default;
+    /** Default destructor */
+    ~NEDepthwiseConvolutionLayerNativeKernel() = default;
+    /** Initialize the kernel's source, destination and parameters.
+     *
+     * @note Supported data layouts: NHWC
+     *
+     * @param[in]  input            Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights          Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
+     *                              Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  biases           Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                              Data type supported: Same as @p input, S32 when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output           Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  conv_info        Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     *
+     */
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
+                   const Size2D &dilation = Size2D(1U, 1U));
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerNativeKernel
+     *
+     * @note Supported data layouts: NHWC
+     *
+     * @param[in] input            Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] weights          Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H].
+     *                             Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] biases           Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                             Data type supported: Same as @p input, S32 when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] output           Destination tensor info. Data type supported: Same as @p input.
+     * @param[in] conv_info        Padding and stride information to use for the convolution.
+     * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
+                           const Size2D &dilation = Size2D(1U, 1U));
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    template <typename T>
+    using FloatEnalber = typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, int>::type; // SFINAE helper: names int only when the is_floating_point trait holds for T
+    /** Overload of run_depthwise that participates in overload resolution only when FloatEnalber<T> is valid. */
+    template <typename T, typename TW, FloatEnalber<T> = 0>
+    void run_depthwise(const Window &window, bool has_biases);
+
+    template <typename T>
+    using Quantized8bitEnalber = typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type; // SFINAE helper: names int only when T is uint8_t or int8_t
+    /** Overload of run_depthwise that participates in overload resolution only when Quantized8bitEnalber<T> is valid. */
+    template <typename T, typename TW, Quantized8bitEnalber<T> = 0>
+    void run_depthwise(const Window &window, bool has_biases);
+
+    /** Common signature for all the specialised depthwise convolution native functions
+     * @param[in] window     Region on which to execute the kernel.
+     * @param[in] has_biases Bias flag handed to the specialised function (presumably mirrors _has_biases - TODO confirm).
+     */
+    using DepthwiseFunctionPtr = void (NEDepthwiseConvolutionLayerNativeKernel::*)(const Window &window, bool has_biases);
+
+    DepthwiseFunctionPtr _func; /**< Pointer to the specialised run_depthwise member function to execute */
+    const ITensor       *_input; /**< Source tensor */
+    const ITensor       *_weights; /**< Weights tensor */
+    const ITensor       *_biases; /**< Biases tensor */
+    ITensor             *_output; /**< Destination tensor */
+    PadStrideInfo        _conv_info; /**< Padding and stride information */
+    unsigned int         _depth_multiplier; /**< Depth multiplier */
+    Size2D               _dilation; /**< Dilation across x and y */
+    std::vector<int>     _output_multiplier; /**< NOTE(review): presumably requantization multipliers for quantized types - confirm in the .cpp */
+    std::vector<int>     _output_shift; /**< NOTE(review): presumably requantization shifts for quantized types - confirm in the .cpp */
+    bool                 _has_biases; /**< Bias flag (presumably the value passed as has_biases to _func - TODO confirm) */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index 2f3c6f4..36e9c92 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
+#include "src/core/NEON/kernels/NEDequantizationLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.h b/src/core/NEON/kernels/NEDequantizationLayerKernel.h
new file mode 100644
index 0000000..9cc7192
--- /dev/null
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEDEQUANTIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_NEDEQUANTIZATIONLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the dequantization layer kernel (quantized input, F16/F32 output; see configure()). */
+class NEDequantizationLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEDequantizationLayerKernel";
+    }
+    /** Default constructor */
+    NEDequantizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDequantizationLayerKernel(const NEDequantizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDequantizationLayerKernel &operator=(const NEDequantizationLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDequantizationLayerKernel(NEDequantizationLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDequantizationLayerKernel &operator=(NEDequantizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEDequantizationLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] output Destination tensor with the same dimensions as @p input. Data type supported: F16/F32.
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDequantizationLayerKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[in] output Output tensor info. Data types supported: F16/F32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input; /**< Source tensor */
+    ITensor       *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEDEQUANTIZATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
index 5d3fc01..8d641a3 100644
--- a/src/core/NEON/kernels/NEDerivativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
+#include "src/core/NEON/kernels/NEDerivativeKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.h b/src/core/NEON/kernels/NEDerivativeKernel.h
new file mode 100644
index 0000000..112b2b0
--- /dev/null
+++ b/src/core/NEON/kernels/NEDerivativeKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEDERIVATIVEKERNEL_H
+#define ARM_COMPUTE_NEDERIVATIVEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the derivative along the X/Y directions on a tensor.
+ * @note At least one of the X or Y gradient outputs must be passed to configure().
+ */
+class NEDerivativeKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEDerivativeKernel";
+    }
+    /** Default constructor */
+    NEDerivativeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDerivativeKernel(const NEDerivativeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDerivativeKernel &operator=(const NEDerivativeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDerivativeKernel(NEDerivativeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDerivativeKernel &operator=(NEDerivativeKernel &&) = default;
+    /** Default destructor */
+    ~NEDerivativeKernel() = default;
+    /** Initialise the kernel's source, destinations and border mode
+     *
+     * @note At least one of output_x or output_y must be set
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Function to perform derivative along the X direction on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void derivative_x(const Window &window);
+    /** Function to perform derivative along the Y direction on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void derivative_y(const Window &window);
+    /** Function to perform derivative along both the X and Y directions on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void derivative_xy(const Window &window);
+    /** Common signature for all the specialised derivative functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using DerivativeFunction = void (NEDerivativeKernel::*)(const Window &window);
+    /** Derivative member function to run: derivative_x, derivative_y or derivative_xy (presumably chosen in configure() from the outputs provided) */
+    DerivativeFunction _func;
+
+private:
+    const ITensor *_input;    /**< Input tensor */
+    ITensor       *_output_x; /**< Output tensor - Derivative along the X direction */
+    ITensor       *_output_y; /**< Output tensor - Derivative along the Y direction */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEDERIVATIVEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp
index cc781c6..dc9ec22 100644
--- a/src/core/NEON/kernels/NEDilateKernel.cpp
+++ b/src/core/NEON/kernels/NEDilateKernel.cpp
@@ -21,13 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
+#include "src/core/NEON/kernels/NEDilateKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/NEDilateKernel.h b/src/core/NEON/kernels/NEDilateKernel.h
new file mode 100644
index 0000000..f1d3431
--- /dev/null
+++ b/src/core/NEON/kernels/NEDilateKernel.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEDILATEKERNEL_H
+#define ARM_COMPUTE_NEDILATEKERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform boolean image dilation */
+class NEDilateKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEDilateKernel";
+    }
+    /** Default constructor */
+    NEDilateKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDilateKernel(const NEDilateKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDilateKernel &operator=(const NEDilateKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDilateKernel(NEDilateKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDilateKernel &operator=(NEDilateKernel &&) = default;
+    /** Default destructor */
+    ~NEDilateKernel() = default;
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: U8
+     * @param[in]  border_undefined True if the border mode is undefined. False if it is replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEDILATEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 56cd6e6..87b9fb1 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
 
 #include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
 #include "src/core/NEON/wrapper/wrapper.h"
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
new file mode 100644
index 0000000..94c97cf
--- /dev/null
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H
+#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON interface for Direct Convolution Layer kernel */
+class NEDirectConvolutionLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEDirectConvolutionLayerKernel";
+    }
+    /** Default constructor */
+    NEDirectConvolutionLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerKernel(const NEDirectConvolutionLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerKernel &operator=(const NEDirectConvolutionLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerKernel(NEDirectConvolutionLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerKernel &operator=(NEDirectConvolutionLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEDirectConvolutionLayerKernel() = default;
+    /** Set the input, weights, and output tensors.
+     *
+     * @note DirectConvolution only works in the following configurations:
+     *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
+     *        3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3
+     *
+     * @param[in]  input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
+     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
+     *                       Data type supported: Same as @p input.
+     * @param[out] output    Output tensor.
+     *                       The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: F16/F32
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel
+     *
+     * @param[in] input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
+     *                      while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
+     * @param[in] weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                      The 3rd dimension must be the same as the input's volume 3rd dimension.
+     *                      Data type supported: Same as @p input.
+     * @param[in] output    Output tensor.
+     *                      The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: F16/F32
+     * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Template function for the optimized NHWC convolution, executed on @p window */
+    template <typename T>
+    void convolve_nhwc_optimized(const Window &window);
+
+    /** Template function for the NHWC convolution, executed on @p window */
+    template <typename T>
+    void convolve_nhwc(const Window &window);
+
+    const ITensor *_input; /**< Input tensor to convolve */
+    const ITensor *_weights; /**< Weights tensor */
+    ITensor       *_output; /**< Output tensor */
+    PadStrideInfo  _conv_info; /**< Padding and stride information */
+    BorderSize     _border_size; /**< Border size (presumably the value reported by border_size() - TODO confirm) */
+    unsigned int   _kernel_size; /**< Kernel size (presumably the spatial size of the weights - TODO confirm) */
+    unsigned int   _num_weight_elems_read_per_row; /**< Number of weight elements read per row */
+    unsigned int   _num_elems_read_per_iteration; /**< Number of elements read per iteration */
+    unsigned int   _num_elems_written_per_iteration; /**< Number of elements written per iteration */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index abaaf12..de5a88e 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
new file mode 100644
index 0000000..b1b8810
--- /dev/null
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H
+#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+/** NEON kernel to accumulate the biases, if provided, or downscale in case of quantized input.
+ *
+ * @note We assume bias to be shared
+ * @note For quantized computations (i.e. @p input of S32 type) the output data type for auto-initialization must be passed as part
+ *       of the @ref DirectConvolutionLayerOutputStageKernelInfo.
+ */
+class NEDirectConvolutionLayerOutputStageKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEDirectConvolutionLayerOutputStageKernel";
+    }
+    /** Default constructor */
+    NEDirectConvolutionLayerOutputStageKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerOutputStageKernel(const NEDirectConvolutionLayerOutputStageKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerOutputStageKernel &operator=(const NEDirectConvolutionLayerOutputStageKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerOutputStageKernel(NEDirectConvolutionLayerOutputStageKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerOutputStageKernel &operator=(NEDirectConvolutionLayerOutputStageKernel &&) = default;
+    /** Default destructor */
+    ~NEDirectConvolutionLayerOutputStageKernel() = default;
+    /** Set the accumulate buffer and the biases of the kernel.
+     *
+     * @param[in, out] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
+     *                        Data type supported: F16/F32/S32
+     * @param[in]      bias   (Optional) The shared bias tensor to add. It must be a 1D tensor. Data type supported: Same as @p input
+     * @param[out]     output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
+     *                        Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
+     *                        Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p input is S32
+     * @param[in]      info   (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
+     */
+    void configure(ITensor *input, const ITensor *bias = nullptr, ITensor *output = nullptr,
+                   const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel
+     *
+     * @param[in] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
+     *                   Data type supported: F16/F32/S32
+     * @param[in] bias   (Optional) The shared bias tensor to add. It must be a 1D tensor. Data type supported: Same as @p input
+     * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
+     *                   Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
+     *                   Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p input is S32
+     * @param[in] info   (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr,
+                           const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    using OutputStageKernel = void(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias); // Function type that _func points to
+
+private:
+    OutputStageKernel *_func; /**< Pointer to the output stage function to execute */
+    ITensor           *_input; /**< Input tensor (also the destination when the computation is in-place) */
+    const ITensor     *_bias; /**< Shared bias tensor, may be nullptr */
+    ITensor           *_output; /**< Output tensor, may be nullptr for in-place computation */
+    int                _result_fixedpoint_multiplier; /**< NOTE(review): presumably forwarded to _func as result_fixedpoint_multiplier - confirm in the .cpp */
+    int                _result_shift; /**< NOTE(review): presumably forwarded to _func as result_shift - confirm in the .cpp */
+    int                _result_offset_after_shift; /**< NOTE(review): presumably forwarded to _func as result_offset_after_shift - confirm in the .cpp */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
index efe6161..bb4e9a6 100644
--- a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
+#include "src/core/NEON/kernels/NEElementwiseOperationKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.h b/src/core/NEON/kernels/NEElementwiseOperationKernel.h
new file mode 100644
index 0000000..b0037d3
--- /dev/null
+++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.h
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H
+#define ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for an element-wise binary operation kernel
+ *
+ * Element-wise operation is computed by:
+ * @f[ output(x,y) = OP(input1(x,y), input2(x,y))@f]
+ *
+ */
+class NEElementwiseOperationKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEElementwiseOperationKernel";
+    }
+    /** Default constructor */
+    NEElementwiseOperationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseOperationKernel(const NEElementwiseOperationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseOperationKernel &operator=(const NEElementwiseOperationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEElementwiseOperationKernel(NEElementwiseOperationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEElementwiseOperationKernel &operator=(NEElementwiseOperationKernel &&) = default;
+    /** Default destructor */
+    ~NEElementwiseOperationKernel() = default;
+
+    /** Common signature for all the specialised element-wise functions
+     *
+     * @param[in]  input1 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in]  input2 Second tensor input. Data types supported: Same as @p input1.
+     * @param[out] output Output tensor. Data types supported: Dependent on subclass.
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using ElementwiseFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+protected:
+    /** Validate the arguments passed to the kernel
+     *
+     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Dependent on subclass.
+     */
+    static Status validate_arguments_common(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
+
+    /** Common configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff)
+     *
+     */
+    void configure_common(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+
+    /** Function to use for the particular tensor types passed to configure(); it has the same signature as ElementwiseFunction */
+    std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)> _function;
+
+    const ITensor *_input1;
+    const ITensor *_input2;
+    ITensor       *_output;
+};
+
+class NEArithmeticOperationKernel : public NEElementwiseOperationKernel
+{
+public:
+    /** Default constructor */
+    NEArithmeticOperationKernel() = default;
+
+    /** Configure the kernel for the given arithmetic operation
+     *
+     * @param[in]  op     Arithmetic operation to be executed.
+     * @param[in]  input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out] output Output tensor info. Data types supported: Same as @p input1.
+     */
+    void configure(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
+     *
+     * @param[in] op     Arithmetic operation to be executed.
+     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a Status
+     */
+    static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+protected:
+    // Static argument validation helper. Being static it is not a virtual override; NEDivisionOperationKernel and NEPowerOperationKernel declare their own.
+    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
+};
+
+class NEDivisionOperationKernel : public NEArithmeticOperationKernel
+{
+public:
+    /** Default constructor */
+    NEDivisionOperationKernel() = default;
+
+    /** Configure the division kernel
+     *
+     * @param[in]  input1 First tensor input info. Data types supported: S32/F16/F32.
+     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out] output Output tensor info. Data types supported: Same as @p input1.
+     */
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDivisionOperationKernel
+     *
+     * @param[in] input1 First tensor input info. Data types supported: S32/F16/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a Status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+protected:
+    // Static argument validation helper; it hides (does not virtually override) NEArithmeticOperationKernel::validate_arguments.
+    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
+};
+
+class NEPowerOperationKernel : public NEArithmeticOperationKernel
+{
+public:
+    /** Default constructor */
+    NEPowerOperationKernel() = default;
+
+    /** Configure the power kernel
+     *
+     * @param[in]  input1 First tensor input info. Data types supported: F16/F32.
+     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out] output Output tensor info. Data types supported: Same as @p input1.
+     */
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEPowerOperationKernel
+     *
+     * @param[in] input1 First tensor input info. Data types supported: F16/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a Status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+protected:
+    // Static argument validation helper; it hides (does not virtually override) NEArithmeticOperationKernel::validate_arguments.
+    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
+};
+
+class NEComparisonOperationKernel : public NEElementwiseOperationKernel
+{
+public:
+    /** Default constructor */
+    NEComparisonOperationKernel() = default;
+
+    /** Configure the kernel for the given comparison operation
+     *
+     * @param[in]  op     Comparison operation to be executed.
+     * @param[in]  input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out] output Output tensor info. Data types supported: U8.
+     */
+    void configure(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
+     *
+     * @param[in] op     Comparison operation to be executed.
+     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: U8.
+     *
+     * @return a Status
+     */
+    static Status validate(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+protected:
+    // Static argument validation helper for this kernel (a static member function, so not a virtual override).
+    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
index 8e4b7ed..d899643 100644
--- a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
+#include "src/core/NEON/kernels/NEElementwiseUnaryKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEElementwiseUnaryKernel.h b/src/core/NEON/kernels/NEElementwiseUnaryKernel.h
new file mode 100644
index 0000000..fcf0aa5
--- /dev/null
+++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H
+#define ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for an element-wise unary operation kernel
+ *
+ * Element-wise operation is computed by:
+ * @f[ output(x) = OP(input(x))@f]
+ *
+ */
+class NEElementwiseUnaryKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEElementwiseUnaryKernel";
+    }
+    /** Default constructor */
+    NEElementwiseUnaryKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseUnaryKernel(const NEElementwiseUnaryKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseUnaryKernel &operator=(const NEElementwiseUnaryKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEElementwiseUnaryKernel(NEElementwiseUnaryKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEElementwiseUnaryKernel &operator=(NEElementwiseUnaryKernel &&) = default;
+    /** Default destructor */
+    ~NEElementwiseUnaryKernel() = default;
+
+    /** Function to configure the @ref NEElementwiseUnaryKernel
+     *
+     * @param[in]  op     Element-wise unary operation to be executed.
+     * @param[in]  input  Input tensor. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
+     * @param[out] output Output tensor. Data types supported: Same as @p input.
+     */
+    void configure(ElementWiseUnary op, const ITensor *input, ITensor *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEElementwiseUnaryKernel
+     *
+     * @param[in] op     Element-wise unary operation to be executed.
+     * @param[in] input  Input tensor info. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a Status
+     */
+    static Status validate(ElementWiseUnary op, const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Pointer-to-member type of the specialised implementation held in _func (elementwise_op<ScalarType> has this signature)
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using ElementwiseUnaryPtr = void (NEElementwiseUnaryKernel::*)(const Window &window);
+
+    /** Template function to run elementwise unary operation
+     *
+     * @tparam ScalarType Scalar datatype
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename ScalarType>
+    void elementwise_op(const Window &window);
+
+    ElementwiseUnaryPtr _func;
+    const ITensor      *_input;
+    ITensor            *_output;
+    ElementWiseUnary    _op;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H */
diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp
index 31b0f48..171a6c8 100644
--- a/src/core/NEON/kernels/NEErodeKernel.cpp
+++ b/src/core/NEON/kernels/NEErodeKernel.cpp
@@ -21,13 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
+#include "src/core/NEON/kernels/NEErodeKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/NEErodeKernel.h b/src/core/NEON/kernels/NEErodeKernel.h
new file mode 100644
index 0000000..54f2867
--- /dev/null
+++ b/src/core/NEON/kernels/NEErodeKernel.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEERODEKERNEL_H
+#define ARM_COMPUTE_NEERODEKERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform boolean image erosion (U8 source and destination, see configure()) */
+class NEErodeKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEErodeKernel";
+    }
+    /** Default constructor */
+    NEErodeKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEErodeKernel(const NEErodeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEErodeKernel &operator=(const NEErodeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEErodeKernel(NEErodeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEErodeKernel &operator=(NEErodeKernel &&) = default;
+    /** Default destructor */
+    ~NEErodeKernel() = default;
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: U8
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+    // Inherited methods overridden (both are declared with override):
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEERODEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
index d8036f2..200ee6b 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
new file mode 100644
index 0000000..f436c36
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H
+#define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the FFT digit reverse operation kernel. */
+class NEFFTDigitReverseKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEFFTDigitReverseKernel";
+    }
+    /** Default constructor */
+    NEFFTDigitReverseKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFTDigitReverseKernel(const NEFFTDigitReverseKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFTDigitReverseKernel &operator=(const NEFFTDigitReverseKernel &) = delete;
+    /** Default move constructor */
+    NEFFTDigitReverseKernel(NEFFTDigitReverseKernel &&) = default;
+    /** Default move assignment operator */
+    NEFFTDigitReverseKernel &operator=(NEFFTDigitReverseKernel &&) = default;
+    /** Default destructor */
+    ~NEFFTDigitReverseKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: F32. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
+     * @param[out] output Destination tensor. Data type supported: same as @p input. Number of channels supported: 2 (complex tensor).
+     * @param[in]  idx    Digit reverse index tensor. Data type supported: U32
+     * @param[in]  config Kernel configuration.
+     */
+    void configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEFFTDigitReverseKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: F32. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
+     * @param[in] output Destination tensor info. Data type supported: same as @p input. Number of channels supported: 2 (complex tensor).
+     * @param[in] idx    Digit reverse index tensor info. Data type supported: U32
+     * @param[in] config Kernel configuration.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    using NEFFTDigitReverseKernelFunctionPtr = void (NEFFTDigitReverseKernel::*)(const Window &window);
+
+    template <bool is_input_complex, bool is_conj>
+    void digit_reverse_kernel_axis_0(const Window &window);
+
+    template <bool is_input_complex, bool is_conj>
+    void digit_reverse_kernel_axis_1(const Window &window);
+
+    NEFFTDigitReverseKernelFunctionPtr _func;
+    const ITensor                     *_input;
+    ITensor                           *_output;
+    const ITensor                     *_idx;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
index 1b0af48..cb1391a 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.h b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
new file mode 100644
index 0000000..8a695b7
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H
+#define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/NEON/INEKernel.h"
+
+#include <arm_neon.h>
+#include <set>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the FFT radix stage kernel. */
+class NEFFTRadixStageKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEFFTRadixStageKernel";
+    }
+    /** Default constructor */
+    NEFFTRadixStageKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFTRadixStageKernel(const NEFFTRadixStageKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFTRadixStageKernel &operator=(const NEFFTRadixStageKernel &) = delete;
+    /** Default move constructor */
+    NEFFTRadixStageKernel(NEFFTRadixStageKernel &&) = default;
+    /** Default move assignment operator */
+    NEFFTRadixStageKernel &operator=(NEFFTRadixStageKernel &&) = default;
+    /** Default destructor */
+    ~NEFFTRadixStageKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @note If the output tensor is nullptr, the FFT will be performed in-place
+     *
+     * @param[in,out] input  Source tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+     * @param[out]    output Destination tensor. Data type supported: same as @p input. Number of channels supported: same as @p input.
+     * @param[in]     config FFT descriptor metadata.
+     */
+    void configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEFFTRadixStageKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+     * @param[in] output Destination tensor info. Data type supported: same as @p input. Number of channels supported: same as @p input.
+     * @param[in] config FFT descriptor metadata.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config);
+    /** Returns the radix values that are supported by the FFT kernel
+     *
+     * @return A set of the supported radix values
+     */
+    static std::set<unsigned int> supported_radix();
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    ITensor     *_input;
+    ITensor     *_output;
+    bool         _run_in_place;
+    unsigned int _Nx;
+    unsigned int _axis;
+    unsigned int _radix;
+
+    void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config);
+    void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config);
+
+    using FFTFunctionPointerAxis0 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
+    using FFTFunctionPointerAxis1 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int, unsigned int)>;
+
+    FFTFunctionPointerAxis0 _func_0;
+    FFTFunctionPointerAxis1 _func_1;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
index 0cb8b84..6dc5541 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h"
+#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.h b/src/core/NEON/kernels/NEFFTScaleKernel.h
new file mode 100644
index 0000000..24a19f9
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H
+#define ARM_COMPUTE_NEFFTSCALEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the inverse FFT scale kernel. */
+class NEFFTScaleKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEFFTScaleKernel";
+    }
+    /** Default constructor */
+    NEFFTScaleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFTScaleKernel(const NEFFTScaleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFTScaleKernel &operator=(const NEFFTScaleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFFTScaleKernel(NEFFTScaleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFFTScaleKernel &operator=(NEFFTScaleKernel &&) = default;
+    /** Default destructor */
+    ~NEFFTScaleKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in,out] input  Source tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+     * @param[out]    output Destination tensor. Data type supported: same as @p input. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
+     * @param[in]     config Kernel configuration
+     */
+    void configure(ITensor *input, ITensor *output, const FFTScaleKernelInfo &config);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEFFTScaleKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+     * @param[in] output Destination tensor info. Data type supported: same as @p input. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
+     * @param[in] config Kernel configuration
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    ITensor *_input;
+    ITensor *_output;
+    float    _scale;
+    bool     _run_in_place;
+    bool     _is_conj;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEFFTSCALEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp
index 99312f5..c9280d8 100644
--- a/src/core/NEON/kernels/NEFastCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEFastCornersKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
+#include "src/core/NEON/kernels/NEFastCornersKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.h b/src/core/NEON/kernels/NEFastCornersKernel.h
new file mode 100644
index 0000000..a4086af
--- /dev/null
+++ b/src/core/NEON/kernels/NEFastCornersKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEFASTCORNERSKERNEL_H
+#define ARM_COMPUTE_NEFASTCORNERSKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** NEON kernel to perform fast corners */
+class NEFastCornersKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEFastCornersKernel";
+    }
+    /** Default constructor */
+    NEFastCornersKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFastCornersKernel(const NEFastCornersKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFastCornersKernel &operator=(const NEFastCornersKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFastCornersKernel(NEFastCornersKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFastCornersKernel &operator=(NEFastCornersKernel &&) = default;
+    /** Default destructor */
+    ~NEFastCornersKernel() = default;
+    /** Initialise the kernel.
+     *
+     * @param[in]  input               Source image. Data type supported: U8.
+     * @param[out] output              Output image. Data type supported: U8.
+     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+     * @param[in]  non_max_suppression True if non-maxima suppression is applied, false otherwise.
+     * @param[in]  border_undefined    True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    const IImage *_input;               /**< source image */
+    IImage       *_output;              /**< intermediate results */
+    uint8_t       _threshold;           /**< threshold on difference between intensity */
+    bool          _non_max_suppression; /**< true if non-maxima suppression is applied in the next stage */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEFASTCORNERSKERNEL_H */
diff --git a/src/core/NEON/kernels/NEFillArrayKernel.cpp b/src/core/NEON/kernels/NEFillArrayKernel.cpp
index 93798db..e8ae926 100644
--- a/src/core/NEON/kernels/NEFillArrayKernel.cpp
+++ b/src/core/NEON/kernels/NEFillArrayKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
+#include "src/core/NEON/kernels/NEFillArrayKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEFillArrayKernel.h b/src/core/NEON/kernels/NEFillArrayKernel.h
new file mode 100644
index 0000000..c984167
--- /dev/null
+++ b/src/core/NEON/kernels/NEFillArrayKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEFILLARRAYKERNEL_H
+#define ARM_COMPUTE_NEFILLARRAYKERNEL_H
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** This kernel adds all texels greater than or equal to the threshold value to the keypoint array. */
+class NEFillArrayKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEFillArrayKernel";
+    }
+    /** Default constructor */
+    NEFillArrayKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillArrayKernel(const NEFillArrayKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillArrayKernel &operator=(const NEFillArrayKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFillArrayKernel(NEFillArrayKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFillArrayKernel &operator=(NEFillArrayKernel &&) = default;
+    /** Default destructor */
+    ~NEFillArrayKernel() = default;
+
+    /** Initialise the kernel.
+     *
+     * @param[in]  input     Source image. Data type supported: U8.
+     * @param[in]  threshold Texels greater than the threshold will be added to the array.
+     * @param[out] output    Array of keypoints to store the results.
+     */
+    void configure(const IImage *input, uint8_t threshold, IKeyPointArray *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    bool is_parallelisable() const override;
+
+private:
+    const IImage   *_input;
+    IKeyPointArray *_output;
+    uint8_t         _threshold;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEFILLARRAYKERNEL_H*/
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index c1dd5cf..4880790 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h
new file mode 100644
index 0000000..65908be
--- /dev/null
+++ b/src/core/NEON/kernels/NEFillBorderKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEFILLBORDERKERNEL_H
+#define ARM_COMPUTE_NEFILLBORDERKERNEL_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the kernel to fill borders */
+class NEFillBorderKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEFillBorderKernel";
+    }
+    /** Default constructor */
+    NEFillBorderKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillBorderKernel(const NEFillBorderKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillBorderKernel &operator=(const NEFillBorderKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFillBorderKernel(NEFillBorderKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFillBorderKernel &operator=(NEFillBorderKernel &&) = default;
+    /** Default destructor */
+    ~NEFillBorderKernel() = default;
+
+    /** Initialise the kernel.
+     *
+     * @note This kernel fills the borders within the XY-planes.
+     *
+     * @param[in,out] tensor                Tensor to process. Data types supported: All.
+     * @param[in]     border_size           Size of the border to fill in elements.
+     * @param[in]     border_mode           Border mode to use when filling the border.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    void fill_replicate_single_channel(const Window &window);
+    void fill_constant_value_single_channel(const Window &window);
+
+    ITensor   *_tensor;
+    BorderSize _border_size;
+    BorderMode _mode;
+    PixelValue _constant_border_value;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEFILLBORDERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
index e6b34b6..8c0dc10 100644
--- a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
+#include "src/core/NEON/kernels/NEFlattenLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEFlattenLayerKernel.h b/src/core/NEON/kernels/NEFlattenLayerKernel.h
new file mode 100644
index 0000000..5fd5f43
--- /dev/null
+++ b/src/core/NEON/kernels/NEFlattenLayerKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEFLATTENLAYERKERNEL_H
+#define ARM_COMPUTE_NEFLATTENLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the flatten layer kernel. */
+class NEFlattenLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEFlattenLayerKernel";
+    }
+    /** Default constructor */
+    NEFlattenLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFlattenLayerKernel(const NEFlattenLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFlattenLayerKernel &operator=(const NEFlattenLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFlattenLayerKernel(NEFlattenLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFlattenLayerKernel &operator=(NEFlattenLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEFlattenLayerKernel() = default;
+
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input  First input tensor to flatten with at least 3 dimensions.
+     *                    The dimensions above the third will be interpreted as batches. Data types supported: All
+     * @param[out] output Output tensor with shape [w*h*d, input_batches] where:
+     *                    w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEFlattenLayerKernel
+     *
+     * @param[in] input  First input tensor info to flatten with at least 3 dimensions.
+     *                   The dimensions above the third will be interpreted as batches. Data types supported: All
+     * @param[in] output Output tensor info with shape [w*h*d, input_batches] where:
+     *                   w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;
+    ITensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEFLATTENLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index 48f964c..2750acd 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -21,14 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
+#include "src/core/NEON/kernels/NEFloorKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Validate.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/NEFloorKernel.h b/src/core/NEON/kernels/NEFloorKernel.h
new file mode 100644
index 0000000..99c016b
--- /dev/null
+++ b/src/core/NEON/kernels/NEFloorKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEFLOORKERNEL_H
+#define ARM_COMPUTE_NEFLOORKERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a floor operation */
+class NEFloorKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEFloorKernel";
+    }
+    /** Default constructor */
+    NEFloorKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFloorKernel(const NEFloorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFloorKernel &operator=(const NEFloorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFloorKernel(NEFloorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFloorKernel &operator=(NEFloorKernel &&) = default;
+    /** Default destructor */
+    ~NEFloorKernel() = default;
+    /** Set the source and destination tensors of the kernel
+     *
+     * @param[in]  input  Source tensor. Data type supported: F16/F32.
+     * @param[out] output Destination tensor. Same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEFloorKernel
+     *
+     * @param[in] input  Source tensor info. Data type supported: F16/F32.
+     * @param[in] output Destination tensor info. Same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEFLOORKERNEL_H */
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
index e353df1..99f830f 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
+#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
new file mode 100644
index 0000000..ee767b0
--- /dev/null
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEFUSEBATCHNORMALIZATIONKERNEL_H
+#define ARM_COMPUTE_NEFUSEBATCHNORMALIZATIONKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** NEON kernel to fuse the batch normalization node to a preceding convolution node */
+class NEFuseBatchNormalizationKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEFuseBatchNormalizationKernel";
+    }
+    /** Default constructor */
+    NEFuseBatchNormalizationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFuseBatchNormalizationKernel(const NEFuseBatchNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFuseBatchNormalizationKernel &operator=(const NEFuseBatchNormalizationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFuseBatchNormalizationKernel(NEFuseBatchNormalizationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFuseBatchNormalizationKernel &operator=(NEFuseBatchNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~NEFuseBatchNormalizationKernel() = default;
+    /** Set the source and destination tensors of the kernel
+     *
+     * @param[in]  input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
+     * @param[in]  bn_mean       Batch normalization layer mean tensor. Same as @p input_weights
+     * @param[in]  bn_var        Batch normalization layer variance tensor. Same as @p input_weights
+     * @param[out] fused_weights (Optional) Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights
+     * @param[out] fused_bias    (Optional) Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
+     * @param[in]  input_bias    (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
+     * @param[in]  bn_beta       (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
+     *                           @note if nullptr, bn_beta is set to 0.0
+     * @param[in]  bn_gamma      (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
+     *                           @note if nullptr, bn_gamma is set to 1.0
+     * @param[in]  epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
+     * @param[in]  fbn_type      (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
+     */
+    void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
+                   const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
+                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel
+     *
+     * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
+     * @param[in] bn_mean       Batch normalization layer mean tensor info. Same as @p input_weights
+     * @param[in] bn_var        Batch normalization layer variance tensor info. Same as @p input_weights
+     * @param[in] fused_weights (Optional) Output fused weights tensor info. It can be a nullptr in case of in-place computation. Same as @p input_weights
+     * @param[in] fused_bias    (Optional) Output fused bias tensor info. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
+     * @param[in] input_bias    (Optional) Input bias tensor info for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
+     * @param[in] bn_beta       (Optional) Batch normalization layer beta tensor info. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
+     *                          @note if nullptr, bn_beta is set to 0.0
+     * @param[in] bn_gamma      (Optional) Batch normalization layer gamma tensor info. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
+     *                          @note if nullptr, bn_gamma is set to 1.0
+     * @param[in] epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
+     * @param[in] fbn_type      (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+                           const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+                           const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
+                           float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input_weights;
+    const ITensor *_input_bias;
+    const ITensor *_bn_mean;
+    const ITensor *_bn_var;
+    const ITensor *_bn_gamma;
+    const ITensor *_bn_beta;
+    ITensor       *_fused_weights;
+    ITensor       *_fused_bias;
+    float          _epsilon;
+    bool           _run_in_place_weights;
+    bool           _run_in_place_bias;
+
+    using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias,
+                                       const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window);
+
+    FuseBatchNormFunction *_func; /**< Pointer to a function with the FuseBatchNormFunction signature */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEFUSEBATCHNORMALIZATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
new file mode 100644
index 0000000..775a2c0
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMASSEMBLYBASE_H
+#define ARM_COMPUTE_NEGEMMASSEMBLYBASE_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Base class for GEMM NEON kernels implemented in Assembly. Derived kernels provide internal_configure(). */
+class NEGEMMAssemblyBaseKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMAssemblyBaseKernel";
+    }
+    /** Default constructor: null tensor pointers, alpha = 1, beta = 0 and no transposition. */
+    NEGEMMAssemblyBaseKernel()
+        : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _is_transposed_0(false), _is_transposed_1(false)
+    {
+    }
+
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMAssemblyBaseKernel(const NEGEMMAssemblyBaseKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMAssemblyBaseKernel &operator=(const NEGEMMAssemblyBaseKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMAssemblyBaseKernel(NEGEMMAssemblyBaseKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMAssemblyBaseKernel &operator=(NEGEMMAssemblyBaseKernel &&) = default;
+    /** Default destructor (virtual, as this class is used as a polymorphic base) */
+    virtual ~NEGEMMAssemblyBaseKernel() = default;
+
+    /** Initialise the kernel's input and output.
+     *
+     * The computed function is C = alpha * AxB + beta * C. All arguments are forwarded unchanged to internal_configure().
+     *
+     * @param[in]     input0          Input tensor containing the Matrix A. Data types supported: F32
+     * @param[in]     input1          Input tensor containing the Matrix B. Data types supported: same as @p input0
+     * @param[in,out] output          Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0.
+     * @param[out]    workspace       Space for intermediate results.
+     * @param[in]     alpha           (Optional) Weight of the matrix product. (Defaults to 1.f)
+     * @param[in]     beta            (Optional) Weight of the accumulation. (Defaults to 0.f)
+     * @param[in]     is_transposed_0 (Optional) True if @p input0 is transposed else false. (Defaults to false)
+     * @param[in]     is_transposed_1 (Optional) True if @p input1 is transposed else false. (Defaults to false)
+     */
+    void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool is_transposed_0 = false, bool is_transposed_1 = false)
+    {
+        internal_configure(input0, input1, output, workspace, alpha, beta, is_transposed_0, is_transposed_1);
+    }
+
+protected:
+    virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) = 0;
+
+    const ITensor *_input0;
+    const ITensor *_input1;
+    ITensor       *_output;
+    ITensor       *_workspace;
+    float          _alpha;
+    float          _beta;
+    bool           _is_transposed_0;
+    bool           _is_transposed_1;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEGEMMASSEMBLYBASE_H*/
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 2997c1d..5d178ea 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -21,16 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
new file mode 100644
index 0000000..85939eb
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H
+#define ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to interleave the elements of a matrix.
+ *
+ * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values).
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
+ */
+class NEGEMMInterleave4x4Kernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMInterleave4x4Kernel";
+    }
+    /** Constructor */
+    NEGEMMInterleave4x4Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMInterleave4x4Kernel(const NEGEMMInterleave4x4Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMInterleave4x4Kernel &operator=(const NEGEMMInterleave4x4Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMInterleave4x4Kernel(NEGEMMInterleave4x4Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMInterleave4x4Kernel &operator=(NEGEMMInterleave4x4Kernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMInterleave4x4Kernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data types supported: All
+     * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: All
+     * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Template function to run gemm interleave 4x4, instantiated per element type.
+     *
+     * @tparam ScalarType Scalar datatype of the tensor elements
+     *
+     * @param[in]  input  Input tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[in]  window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename ScalarType>
+    void gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window);
+
+    /** Common signature for all the specialised gemm interleave 4x4 functions (pointer to member function).
+     *
+     * @param[in]  input  Input tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[in]  window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    using GEMMInterleaveFunctionFuncPtr = void (NEGEMMInterleave4x4Kernel::*)(const ITensor *input, ITensor *output, const Window &window);
+
+    GEMMInterleaveFunctionFuncPtr _func; /**< NOTE(review): presumably bound to a gemm_interleave4x4 specialisation in configure() - confirm in the .cpp */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H*/
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index acc5190..4dbfc3b 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
new file mode 100644
index 0000000..14d03fe
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H
+#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply matrices
+ *
+ * @note @ref NEGEMMLowpMatrixMultiplyKernel low precision matrix product kernel
+ *  This kernel performs the following computation:
+ *
+ *  -# Convert the values of matrix a from 8-bit integers to int32
+ *  -# Convert the values of matrix b from 8-bit integers to int32
+ *  -# Compute the int32 matrix product of the resulting a * b and store the result as int32
+ *
+ */
+class NEGEMMLowpMatrixMultiplyKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMLowpMatrixMultiplyKernel";
+    }
+    /** Constructor */
+    NEGEMMLowpMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpMatrixMultiplyKernel(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpMatrixMultiplyKernel &operator=(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpMatrixMultiplyKernel(NEGEMMLowpMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpMatrixMultiplyKernel &operator=(NEGEMMLowpMatrixMultiplyKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMLowpMatrixMultiplyKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
+     * kernels change the layout of the original matrices to be more cache-friendly.
+     *
+     * @param[in]  input0 Input tensor containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
+     * @param[in]  input1 Input tensor containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
+     */
+    void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel
+     *
+     * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
+     * @param[in] input1 Input tensor info containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+     * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input0;
+    const ITensor *_input1;
+    ITensor       *_output;
+    bool           _slide_matrix_b; // NOTE(review): meaning not visible in this header - confirm against configure()/run() in the .cpp
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H*/
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index 1c76926..174a069 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
new file mode 100644
index 0000000..0f37e58
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
+#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * The final result is:
+ *
+ * mm_result[i][j] = mm_result[i][j] +
+ *                   (vector_sum_col[j] * a_offset) +
+ *                   (vector_sum_row[i] * b_offset) +
+ *                   (a_offset * b_offset * k), where k is the number of matrix A columns (= matrix B rows)
+ *
+ */
+class NEGEMMLowpOffsetContributionKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMLowpOffsetContributionKernel";
+    }
+    /** Constructor */
+    NEGEMMLowpOffsetContributionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpOffsetContributionKernel(const NEGEMMLowpOffsetContributionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpOffsetContributionKernel &operator=(const NEGEMMLowpOffsetContributionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpOffsetContributionKernel(NEGEMMLowpOffsetContributionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpOffsetContributionKernel &operator=(NEGEMMLowpOffsetContributionKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMLowpOffsetContributionKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in, out] mm_result      Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel; updated in-place. Data type supported: S32
+     * @param[in]      vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]      vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]      k              Number of matrix A columns or Matrix B rows.
+     * @param[in]      a_offset       Offset to be added to each element of the matrix A.
+     * @param[in]      b_offset       Offset to be added to each element of the matrix B.
+     */
+    void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionKernel
+     *
+     * @param[in] mm_result      Input tensor info containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in] vector_sum_col Tensor info for the input row-vector of sums of all the entries in each column of matrix B.
+     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] vector_sum_row Tensor info for the input row-vector of sums of all the entries in each row of matrix A.
+     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] a_offset       Offset to be added to each element of the matrix A.
+     * @param[in] b_offset       Offset to be added to each element of the matrix B.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_vector_sum_col;
+    const ITensor *_vector_sum_row;
+    ITensor       *_mm_result;
+    int32_t        _a_offset;
+    int32_t        _b_offset;
+    int32_t        _k_offset; // NOTE(review): presumably the a_offset * b_offset * k term - confirm in configure()
+    bool           _slide_vector_sum_col;
+};
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
index 6a7d225..3c8f5ae 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
new file mode 100644
index 0000000..4c68fb0
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
+#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel used to add the offset contribution and perform the output stage after @ref NEGEMMLowpMatrixMultiplyKernel.
+ *
+ * The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8.
+ * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8.
+ *
+ * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is:
+ *
+ * ((mm_result'[i][j] + result_offset) * result_mult_int) >> result_shift
+ *
+ * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is:
+ *
+ * (FixedPointMul(mm_result'[i][j], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
+ *
+ * where FixedPointMul(x, y) is the nearest integer to the following
+ * mathematical expression, evaluated without overflow or intermediate rounding:
+ *
+ * (x * y) / 2^31
+ *
+ * and mm_result'[i][j] = mm_result[i][j] +
+ *                        (vector_sum_col[j] * a_offset) +
+ *                        (vector_sum_row[i] * b_offset) +
+ *                        (a_offset * b_offset * k), where k is the number of matrix A columns (= matrix B rows)
+ */
+
+class NEGEMMLowpOffsetContributionOutputStageKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMLowpOffsetContributionOutputStageKernel";
+    }
+    /** Constructor */
+    NEGEMMLowpOffsetContributionOutputStageKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpOffsetContributionOutputStageKernel(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpOffsetContributionOutputStageKernel &operator=(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpOffsetContributionOutputStageKernel(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpOffsetContributionOutputStageKernel &operator=(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMLowpOffsetContributionOutputStageKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  mm_result      Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in]  vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+     *                            Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                            Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
+     * @param[out] output         Output tensor containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  k              Number of matrix A columns or Matrix B rows
+     * @param[in]  a_offset       Offset to be added to each element of the matrix A.
+     * @param[in]  b_offset       Offset to be added to each element of the matrix B.
+     * @param[in]  output_stage   GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
+     */
+    void configure(const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
+                   GEMMLowpOutputStageInfo output_stage);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionOutputStageKernel
+     *
+     * @param[in] mm_result      Input tensor info containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in] vector_sum_col Tensor info for the input row-vector of sums of all the entries in each column of matrix B.
+     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] vector_sum_row Tensor info for the input row-vector of sums of all the entries in each row of matrix A.
+     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] bias           Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                           Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
+     * @param[in] output         Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in] a_offset       Offset to be added to each element of the matrix A.
+     * @param[in] b_offset       Offset to be added to each element of the matrix B.
+     * @param[in] output_stage   GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
+                           int32_t                 b_offset,
+                           GEMMLowpOutputStageInfo output_stage);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    // Kernel state: the tensor pointers, offsets and output stage info are named after the configure() arguments.
+    const ITensor          *_vector_sum_col;
+    const ITensor          *_vector_sum_row;
+    const ITensor          *_bias;
+    const ITensor          *_mm_result;
+    ITensor                *_output;
+    int32_t                 _a_offset;
+    int32_t                 _b_offset;
+    int32_t                 _k_offset;
+    bool                    _slide_vector_sum_col;
+    GEMMLowpOutputStageInfo _output_stage;
+};
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
index 659c410..2e78107 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
new file mode 100644
index 0000000..42ef570
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
+#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Add offset terms to final result
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *  -#  -to the [0..255] range and cast to QASYMM8.
+ *  -#  -to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ */
+class NEGEMMLowpQuantizeDownInt32ScaleKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMLowpQuantizeDownInt32ScaleKernel";
+    }
+    /** Constructor */
+    NEGEMMLowpQuantizeDownInt32ScaleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpQuantizeDownInt32ScaleKernel(const NEGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const NEGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpQuantizeDownInt32ScaleKernel(NEGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpQuantizeDownInt32ScaleKernel &operator=(NEGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMLowpQuantizeDownInt32ScaleKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input        Input tensor. Data type supported: S32
+     * @param[in]  bias         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output       Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  output_stage GEMMLowp output stage metadata.
+     */
+    void configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo *output_stage);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
+     *
+     * @param[in]  input        Input tensor info. Data type supported: S32
+     * @param[in]  bias         Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in]  output       Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  output_stage GEMMLowp output stage metadata.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Template function to run the NEGEMMLowpQuantizeDownInt32ScaleKernel
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run(const Window &window);
+
+    /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ScaleKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ScaleKernel::*)(const Window &window);
+
+    QuantizeDownFunctionPtr        _func;
+    const ITensor                 *_input;
+    const ITensor                 *_bias;
+    ITensor                       *_output;
+    const GEMMLowpOutputStageInfo *_output_stage;
+    bool                           _is_bounded_relu;
+};
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
index afa8cec..1fafc62 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
new file mode 100644
index 0000000..d04e713
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
+#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
+ *
+ */
+class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel";
+    }
+    /** Constructor */
+    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input                        Input tensor. Data type supported: S32
+     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output                       Output tensor. Data type supported: QSYMM16
+     * @param[in]  result_fixedpoint_multiplier Fixed point value by which each element of the input matrix is multiplied
+     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
+     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
+     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
+     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
+     */
+    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+     *
+     * @param[in] input  Input tensor info. Data type supported: S32
+     * @param[in] bias   Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                   Biases are 1D tensor info with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor info. Data type supported: QSYMM16
+     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
+     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
+     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Template function to run the NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <bool is_bounded_relu>
+    void run(const Window &window);
+
+    /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)(const Window &window);
+
+    QuantizeDownFunctionPtr _func;
+    const ITensor          *_input;
+    const ITensor          *_bias;
+    ITensor                *_output;
+    int                     _result_fixedpoint_multiplier;
+    int                     _result_shift;
+    int                     _min;
+    int                     _max;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
index 83416e0..bf9ce95 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
new file mode 100644
index 0000000..55c07fb
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
+#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ */
+class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel";
+    }
+    /** Constructor */
+    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input                        Input tensor. Data type supported: S32
+     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output                       Output tensor. Data type supported: QASYMM8_SIGNED
+     * @param[in]  result_fixedpoint_multiplier Fixed point value by which each element of the input matrix is multiplied
+     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
+     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8_SIGNED
+     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED
+     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
+     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+     */
+    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+     *
+     * @param[in] input  Input tensor info. Data type supported: S32
+     * @param[in] bias   Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor info. Data type supported: QASYMM8_SIGNED
+     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED
+     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
+     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Template function to run the NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <bool is_bounded_relu>
+    void run(const Window &window);
+
+    /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)(const Window &window);
+
+    QuantizeDownFunctionPtr _func;
+    const ITensor          *_input;
+    const ITensor          *_bias;
+    ITensor                *_output;
+    int                     _result_fixedpoint_multiplier;
+    int                     _result_shift;
+    int                     _result_offset_after_shift;
+    int                     _min;
+    int                     _max;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 1e8aa0c..cbb56da 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
new file mode 100644
index 0000000..1a8de1c
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
+#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ */
+class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel";
+    }
+    /** Constructor */
+    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input                        Input tensor. Data type supported: S32
+     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output                       Output tensor. Data type supported: QASYMM8
+     * @param[in]  result_fixedpoint_multiplier Fixed point value by which each element of the input matrix is multiplied
+     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
+     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8
+     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+     */
+    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+     *
+     * @param[in] input  Input tensor info. Data type supported: S32
+     * @param[in] bias   Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor info. Data type supported: QASYMM8
+     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <bool is_bounded_relu>
+    void run(const Window &window);
+
+    /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)(const Window &window);
+
+    QuantizeDownFunctionPtr _func;
+    const ITensor          *_input;
+    const ITensor          *_bias;
+    ITensor                *_output;
+    int                     _result_fixedpoint_multiplier;
+    int                     _result_shift;
+    int                     _result_offset_after_shift;
+    int                     _min;
+    int                     _max;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 566872f..db038e5 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h
new file mode 100644
index 0000000..655658c
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H
+#define ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+struct GEMMLowpReductionKernelInfo;
+
+/** Common interface for all NEON reduction kernels */
+class INEGEMMLowpReductionKernel : public INEKernel
+{
+public:
+    /** Constructor */
+    INEGEMMLowpReductionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    INEGEMMLowpReductionKernel(const INEGEMMLowpReductionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    INEGEMMLowpReductionKernel &operator=(const INEGEMMLowpReductionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    INEGEMMLowpReductionKernel(INEGEMMLowpReductionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    INEGEMMLowpReductionKernel &operator=(INEGEMMLowpReductionKernel &&) = default;
+    /** Default destructor */
+    virtual ~INEGEMMLowpReductionKernel() = default;
+
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+     * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
+     * @param[in]  info   Kernel metadata:
+     *                    - k             Number of matrix columns/rows depending on the type of reduction.
+     *                    - is_reshaped   True if the matrix has been reshaped.
+     *                    - scalar        Scalar value to multiply each reduced column/row by.
+     *                    - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    virtual void configure(const ITensor *input, ITensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
+
+protected:
+    const ITensor *_input;
+    ITensor       *_output;
+    int32_t        _k;
+    int32_t        _scalar;
+    bool           _mul_by_scalar;
+};
+
+/** NEON kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class NEGEMMLowpMatrixAReductionKernel : public INEGEMMLowpReductionKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMLowpMatrixAReductionKernel";
+    }
+    /** Default constructor */
+    NEGEMMLowpMatrixAReductionKernel() = default;
+    /** Prevent instances of this class from being copied */
+    NEGEMMLowpMatrixAReductionKernel(const NEGEMMLowpMatrixAReductionKernel &) = delete;
+    /** Prevent instances of this class from being copied */
+    NEGEMMLowpMatrixAReductionKernel &operator=(const NEGEMMLowpMatrixAReductionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpMatrixAReductionKernel(NEGEMMLowpMatrixAReductionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpMatrixAReductionKernel &operator=(NEGEMMLowpMatrixAReductionKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMLowpMatrixAReductionKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+     * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in]  info           Kernel metadata:
+     *                            - k             (num_mtx_a_cols) Number of matrix A columns
+     *                            - is_reshaped   (is_interleaved4x4) True if the matrix A has been interleaved4x4
+     *                            - scalar        Scalar value to multiply each reduced row by.
+     *                            - mul_by_scalar True if each reduced row must be multiplied by a scalar value.
+     */
+    void configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
+     *
+     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+     * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in] info           Kernel metadata:
+     *                           - k             (num_mtx_a_cols) Number of matrix A columns
+     *                           - is_reshaped   (is_interleaved4x4) True if the matrix A has been interleaved4x4
+     *                           - scalar        Scalar value to multiply each reduced row by.
+     *                           - mul_by_scalar True if each reduced row must be multiplied by a scalar value.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Execution of the reduction kernel specialized on the input type
+     *
+     * @param[in] window Execution window
+     */
+    template <typename T>
+    void run_internal(const Window &window);
+};
+
+/** NEON kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class NEGEMMLowpMatrixBReductionKernel : public INEGEMMLowpReductionKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMLowpMatrixBReductionKernel";
+    }
+    /** Default constructor */
+    NEGEMMLowpMatrixBReductionKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMLowpMatrixBReductionKernel(const NEGEMMLowpMatrixBReductionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMLowpMatrixBReductionKernel &operator=(const NEGEMMLowpMatrixBReductionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpMatrixBReductionKernel(NEGEMMLowpMatrixBReductionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpMatrixBReductionKernel &operator=(NEGEMMLowpMatrixBReductionKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMLowpMatrixBReductionKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+     * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in]  info           Kernel metadata:
+     *                            - k             (num_mtx_b_rows) Number of matrix B rows.
+     *                            - is_reshaped   (is_transposed1xW) True if the input tensor is transposed 1xW.
+     *                            - scalar        Scalar value to multiply each reduced column by.
+     *                            - mul_by_scalar True if each reduced column must be multiplied by a scalar value.
+     */
+    void configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
+     *
+     * @param[in] mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+     * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in] info           Kernel metadata:
+     *                           - k             (num_mtx_b_rows) Number of matrix B rows.
+     *                           - is_reshaped   (is_transposed1xW) True if the input tensor is transposed 1xW.
+     *                           - scalar        Scalar value to multiply each reduced column by.
+     *                           - mul_by_scalar True if each reduced column must be multiplied by a scalar value.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Execution of the reduction kernel specialized on the input type
+     *
+     * @param[in] window Execution window
+     * @param[in] info   Thread-related information
+     */
+    template <typename T>
+    void run_internal(const Window &window, const ThreadInfo &info);
+};
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 9aee26c..6a2802a 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
new file mode 100644
index 0000000..4837783
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H
+#define ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size
+ *
+ * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. In case this kernel is used for finalizing GEMM result, we have:
+ *        - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref NEGEMMMatrixMultiplyKernel
+ *        - MTX_1 = C
+ */
+class NEGEMMMatrixAdditionKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMMatrixAdditionKernel";
+    }
+    /** Constructor */
+    NEGEMMMatrixAdditionKernel();
+    /** Prevent instances of this class from being copied */
+    NEGEMMMatrixAdditionKernel(const NEGEMMMatrixAdditionKernel &) = delete;
+    /** Prevent instances of this class from being copied */
+    NEGEMMMatrixAdditionKernel &operator=(const NEGEMMMatrixAdditionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixAdditionKernel(NEGEMMMatrixAdditionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixAdditionKernel &operator=(NEGEMMMatrixAdditionKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMMatrixAdditionKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @note The input and output tensor must have the same dimensions
+     *
+     * @param[in]      input  Input tensor (Matrix C). Data types supported: F16/F32
+     * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
+     * @param[in]      beta   Weight of matrix C
+     */
+    void configure(const ITensor *input, ITensor *output, float beta);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAdditionKernel.
+     *
+     * @note The input and output tensor must have the same dimensions
+     *
+     * @param[in] input  Input tensor info (Matrix C). Data types supported: F16/F32
+     * @param[in] output Output tensor info. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
+     * @param[in] beta   Weight of matrix C
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the matrix addition functions
+     *
+     * @param[in]  input  An input tensor. Data types supported: F16/F32
+     * @param[out] output The output tensor. Data type supported: same as @p input
+     * @param[in]  window Region on which to execute the kernel.
+     * @param[in]  beta   Weight of matrix C
+     */
+    using MatrixAdditionFunction = void(const ITensor *input, ITensor *output, const Window &window, float beta);
+    /** Matrix addition function to use for the particular tensor types passed to configure() */
+    MatrixAdditionFunction *_func;
+    float                   _beta;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index a923689..fc95c08 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
new file mode 100644
index 0000000..1ea948d
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H
+#define ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication
+ *
+ * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
+ * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that both tensors have not been reshaped
+ *
+ */
+class NEGEMMMatrixMultiplyKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMMatrixMultiplyKernel";
+    }
+    /** Constructor */
+    NEGEMMMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMMatrixMultiplyKernel(const NEGEMMMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMMatrixMultiplyKernel &operator=(const NEGEMMMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixMultiplyKernel(NEGEMMMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixMultiplyKernel &operator=(NEGEMMMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
+     *       These two kernels change the layout of the original matrices to be more cache-friendly.
+     *
+     * @param[in]  input0         Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
+     * @param[in]  input1         Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
+     *                            If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
+     * @param[out] output         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
+     * @param[in]  alpha          Weight of the matrix product
+     * @param[in]  is_interleaved True if input0 and input1 have been reshaped respectively using @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
+     * @param[in]  reshape_info   (Optional) GEMM reshape info. If is_interleaved = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
+     */
+    void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixMultiplyKernel
+     *
+     * @param[in] input0         Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
+     * @param[in] input1         Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
+     *                           If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
+     * @param[in] output         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
+     * @param[in] alpha          Weight of the matrix product
+     * @param[in] is_interleaved True if input0 and input1 have been reshaped respectively using @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
+     * @param[in] reshape_info   GEMM reshape info. If is_interleaved = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input0;
+    const ITensor *_input1;
+    ITensor       *_output;
+    float          _alpha;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H*/
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index b9b4fe9..6d9f921 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -21,14 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/AccessWindowStatic.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
new file mode 100644
index 0000000..7120943
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H
+#define ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** NEON kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
+ *
+ * The following is an example of how the 1xW transposition works when the input data type is F32
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * The following is an example of how the 1xW transposition works when the input data type is F16
+ *
+ * @f[
+ * \left( \begin{array}{cccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\
+ * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\
+ * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\
+ * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\
+ * \end{array} \right)
+ * @f]
+ *
+ * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ *
+ */
+class NEGEMMTranspose1xWKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMMTranspose1xWKernel";
+    }
+    /** Constructor */
+    NEGEMMTranspose1xWKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMTranspose1xWKernel(const NEGEMMTranspose1xWKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMTranspose1xWKernel &operator=(const NEGEMMTranspose1xWKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMTranspose1xWKernel(NEGEMMTranspose1xWKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMTranspose1xWKernel &operator=(NEGEMMTranspose1xWKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMTranspose1xWKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data types supported: All
+     * @param[out] output Output tensor. Data type supported: same as @p input.
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: All
+     * @param[in] output Output tensor info. Data type supported: same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp
index 193fe98..55ecb88 100644
--- a/src/core/NEON/kernels/NEGatherKernel.cpp
+++ b/src/core/NEON/kernels/NEGatherKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
+#include "src/core/NEON/kernels/NEGatherKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h
new file mode 100644
index 0000000..d81e34c
--- /dev/null
+++ b/src/core/NEON/kernels/NEGatherKernel.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_NEGATHERKERNEL_H
+#define ARM_COMPUTE_NEGATHERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** NEON kernel to perform the gather operation */
+class NEGatherKernel : public INEKernel
+{
+public:
+    /** Default constructor. */
+    NEGatherKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NEGatherKernel(const NEGatherKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NEGatherKernel &operator=(const NEGatherKernel &) = delete;
+    /** Allow instances of this class to be moved. */
+    NEGatherKernel(NEGatherKernel &&) = default;
+    /** Allow instances of this class to be moved. */
+    NEGatherKernel &operator=(NEGatherKernel &&) = default;
+    /** Default destructor */
+    ~NEGatherKernel() = default;
+
+    /** Name of the kernel
+     *
+     * @return Kernel name
+     */
+    const char *name() const override
+    {
+        return "NEGatherKernel";
+    }
+    /** Initialise the kernel's inputs and outputs
+     *
+     * @param[in]  input   Source tensor. Supported tensor rank: up to 4. Data type supported: All
+     * @param[in]  indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+     * @param[out] output  Destination tensor. Data type supported: Same as @p input
+     * @param[in]  axis    (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
+     */
+    void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGatherKernel
+     *
+     * @param[in] input   Source tensor info. Supported tensor rank: up to 4. Data type supported: All
+     * @param[in] indices Indices tensor info. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+     * @param[in] output  Destination tensor info. Data type supported: Same as @p input
+     * @param[in] axis    The axis in @p input to gather @p indices from. Negative values wrap around. Unlike configure(), no default value is declared here.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Implementation of the gather operation for axis 0.
+     *
+     * For gather on axis 0 an element-by-element copy is performed.
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
+     * @param[in] info   Info about executing thread and CPU.
+     */
+    template <typename U>
+    void gather_0_axis(const Window &window, const ThreadInfo &info);
+
+    /** Implementation of the gather operation for any other axis.
+     *
+     * For axis >= 1 a row-wise copy takes place.
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
+     * @param[in] info   Info about executing thread and CPU.
+     */
+    template <typename U>
+    void gather_n_axis(const Window &window, const ThreadInfo &info);
+
+    using kernel_ptr = void (NEGatherKernel::*)(const Window &window, const ThreadInfo &info); // Pointer-to-member type with the same signature as gather_0_axis() / gather_n_axis()
+
+    const ITensor *_input;
+    const ITensor *_indices;
+    int            _axis;
+    ITensor       *_output;
+    kernel_ptr     _func;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEGATHERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
index 5ff5db7..63b26ab 100644
--- a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
@@ -21,13 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
+#include "src/core/NEON/kernels/NEGaussian3x3Kernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.h b/src/core/NEON/kernels/NEGaussian3x3Kernel.h
new file mode 100644
index 0000000..8973b48
--- /dev/null
+++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H
+#define ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Gaussian 3x3 filter */
+class NEGaussian3x3Kernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGaussian3x3Kernel";
+    }
+    /** Default constructor */
+    NEGaussian3x3Kernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussian3x3Kernel(const NEGaussian3x3Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussian3x3Kernel &operator=(const NEGaussian3x3Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGaussian3x3Kernel(NEGaussian3x3Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGaussian3x3Kernel &operator=(NEGaussian3x3Kernel &&) = default;
+    /** Default destructor */
+    ~NEGaussian3x3Kernel() = default;
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: same as @p input
+     * @param[in]  border_undefined True if the border mode is undefined. False if it is replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
index 5bb3e76..ab2feb0 100644
--- a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
@@ -21,15 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -112,6 +112,10 @@
     input, output);
 }
 
+NEGaussian5x5VertKernel::NEGaussian5x5VertKernel() // Out-of-line definition of the default constructor; the body is intentionally empty
+{
+}
+
 BorderSize NEGaussian5x5VertKernel::border_size() const
 {
     return BorderSize{ 2, 0 };
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.h b/src/core/NEON/kernels/NEGaussian5x5Kernel.h
new file mode 100644
index 0000000..f4bca55
--- /dev/null
+++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H
+#define ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Gaussian 5x5 filter (horizontal pass) */
+class NEGaussian5x5HorKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGaussian5x5HorKernel";
+    }
+    /** Default constructor */
+    NEGaussian5x5HorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussian5x5HorKernel(const NEGaussian5x5HorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussian5x5HorKernel &operator=(const NEGaussian5x5HorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGaussian5x5HorKernel(NEGaussian5x5HorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGaussian5x5HorKernel &operator=(NEGaussian5x5HorKernel &&) = default;
+    /** Default destructor */
+    ~NEGaussian5x5HorKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it is replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    BorderSize _border_size;
+};
+
+/** NEON kernel to perform a Gaussian 5x5 filter (vertical pass) */
+class NEGaussian5x5VertKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGaussian5x5VertKernel";
+    }
+    /** Default constructor */
+    NEGaussian5x5VertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussian5x5VertKernel(const NEGaussian5x5VertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussian5x5VertKernel &operator=(const NEGaussian5x5VertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGaussian5x5VertKernel(NEGaussian5x5VertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGaussian5x5VertKernel &operator=(NEGaussian5x5VertKernel &&) = default;
+    /** Default destructor */
+    ~NEGaussian5x5VertKernel() = default;
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: S16.
+     * @param[out] output           Destination tensor. Data type supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it is replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H */
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
index 62cf414..49c8e9e 100644
--- a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
+++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
@@ -21,17 +21,17 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
+#include "src/core/NEON/kernels/NEGaussianPyramidKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.h b/src/core/NEON/kernels/NEGaussianPyramidKernel.h
new file mode 100644
index 0000000..e852db2
--- /dev/null
+++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H
+#define ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a GaussianPyramid (horizontal pass) */
+class NEGaussianPyramidHorKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGaussianPyramidHorKernel";
+    }
+    /** Default constructor */
+    NEGaussianPyramidHorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidHorKernel(const NEGaussianPyramidHorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidHorKernel &operator=(const NEGaussianPyramidHorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &&) = default;
+    /** Default destructor */
+    ~NEGaussianPyramidHorKernel() = default;
+
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  input  Source tensor. Data type supported: U8.
+     * @param[out] output Destination tensor. Output should have half the input width. Data type supported: S16.
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    int _l2_load_offset;
+};
+
+/** NEON kernel to perform a GaussianPyramid (vertical pass) */
+class NEGaussianPyramidVertKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGaussianPyramidVertKernel";
+    }
+    /** Default constructor */
+    NEGaussianPyramidVertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidVertKernel(const NEGaussianPyramidVertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidVertKernel &operator=(const NEGaussianPyramidVertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &&) = default;
+    /** Default destructor */
+    ~NEGaussianPyramidVertKernel() = default;
+
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  input  Source tensor. Data type supported: S16.
+     * @param[out] output Destination tensor. Output should have half the input height. Data type supported: U8.
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    int _t2_load_offset;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 483f204..516a9b6 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
+#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
new file mode 100644
index 0000000..f6d39e5
--- /dev/null
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
+#define ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for Compute All Anchors kernel */
+class NEComputeAllAnchorsKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEComputeAllAnchorsKernel";
+    }
+
+    /** Default constructor */
+    NEComputeAllAnchorsKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEComputeAllAnchorsKernel(const NEComputeAllAnchorsKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEComputeAllAnchorsKernel &operator=(const NEComputeAllAnchorsKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEComputeAllAnchorsKernel(NEComputeAllAnchorsKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEComputeAllAnchorsKernel &operator=(NEComputeAllAnchorsKernel &&) = default;
+    /** Default destructor */
+    ~NEComputeAllAnchorsKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  anchors     Source tensor. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
+     * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
+     * @param[in]  info        Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
+     *
+     */
+    void configure(const ITensor *anchors, ITensor *all_anchors, const ComputeAnchorsInfo &info);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEComputeAllAnchorsKernel
+     *
+     * @param[in] anchors     Source tensor info. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
+     * @param[in] all_anchors Destination tensor info. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
+     * @param[in] info        Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
+     *
+     * @return a Status
+     */
+    static Status validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    template <typename T>
+    void internal_run(const Window &window);
+
+    const ITensor     *_anchors;
+    ITensor           *_all_anchors;
+    ComputeAnchorsInfo _anchors_info;
+};
+} // namespace arm_compute
+#endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
index 00f4087..089cd34 100644
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
+#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/HOGInfo.h"
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.h b/src/core/NEON/kernels/NEHOGDescriptorKernel.h
new file mode 100644
index 0000000..7845bc2
--- /dev/null
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H
+#define ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H
+
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/Size2D.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform HOG Orientation Binning */
+class NEHOGOrientationBinningKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEHOGOrientationBinningKernel";
+    }
+    /** Default constructor */
+    NEHOGOrientationBinningKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGOrientationBinningKernel(const NEHOGOrientationBinningKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGOrientationBinningKernel &operator=(const NEHOGOrientationBinningKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEHOGOrientationBinningKernel(NEHOGOrientationBinningKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEHOGOrientationBinningKernel &operator=(NEHOGOrientationBinningKernel &&) = default;
+    /** Default destructor */
+    ~NEHOGOrientationBinningKernel() = default;
+
+    /** Initialise the kernel's inputs, output and HOG's metadata
+     *
+     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
+     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
+     * @param[out] output          Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[in]  hog_info        HOG's metadata
+     */
+    void configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the specialised orientation binning functions
+     *
+     * @param[in]  mag_row_ptr   Pointer to the first row of the cell in the magnitude tensor
+     * @param[in]  phase_row_ptr Pointer to the first row of the cell in the phase tensor
+     * @param[out] output_ptr    Pointer to the output cell of hog space tensor
+     * @param[in]  mag_stride    Stride of the magnitude tensor
+     * @param[in]  phase_stride  Stride of the phase tensor
+     * @param[in]  cell_width    Width of the cell
+     * @param[in]  cell_height   Height of the cell
+     * @param[in]  num_bins      Number of bins for each cell
+     * @param[in]  phase_scale   Scale factor to apply to the phase in order to calculate the histogram index
+     */
+    using OrientBinFunc = void(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
+                               size_t cell_height, size_t num_bins, float phase_scale);
+    /** Orientation binning function to use; presumably chosen in configure() from the cell width carried by @p hog_info (TODO confirm) */
+    OrientBinFunc *_func;
+    const ITensor *_input_magnitude;
+    const ITensor *_input_phase;
+    ITensor       *_output;
+    size_t         _cell_width;
+    size_t         _cell_height;
+    size_t         _num_bins;
+    float          _phase_scale;
+};
+
+/** NEON kernel to perform HOG block normalization */
+class NEHOGBlockNormalizationKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEHOGBlockNormalizationKernel";
+    }
+    /** Default constructor */
+    NEHOGBlockNormalizationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGBlockNormalizationKernel(const NEHOGBlockNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGBlockNormalizationKernel &operator=(const NEHOGBlockNormalizationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEHOGBlockNormalizationKernel(NEHOGBlockNormalizationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEHOGBlockNormalizationKernel &operator=(NEHOGBlockNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~NEHOGBlockNormalizationKernel() = default;
+
+    /** Initialise the kernel's input, output and HOG's metadata
+     *
+     * @param[in]  input    Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[out] output   Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog_info HOG's metadata
+     */
+    void configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the specialised block normalization functions
+     *
+     * @param[in]  input_row_ptr              Pointer to the first row of the block in the input hog space tensor
+     * @param[out] output_ptr                 Pointer to the output block of the hog normalized space
+     * @param[in]  input_stride               Stride of the input hog space tensor
+     * @param[in]  num_cells_per_block_height Number of cells per block along the Y direction
+     * @param[in]  num_bins_block_x           Number of bins per block along the X direction
+     * @param[in]  num_bins_block             Number of total bins per block
+     * @param[in]  l2_hyst_threshold          Threshold to use for l2 hysteresis normalization
+     */
+    using BlockNormFunc = void(const float *input_row_ptr, float *output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
+                               float l2_hyst_threshold);
+    /** Block normalization function to use; presumably chosen in configure() from the normalization type carried by @p hog_info (TODO confirm) */
+    BlockNormFunc *_func;
+    const ITensor *_input;
+    ITensor       *_output;
+    Size2D         _num_cells_per_block;
+    Size2D         _num_cells_per_block_stride;
+    size_t         _num_bins;
+    float          _l2_hyst_threshold;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H */
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
index d5dfa41..cba1d55 100644
--- a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+#include "src/core/NEON/kernels/NEHOGDetectorKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/HOGInfo.h"
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.h b/src/core/NEON/kernels/NEHOGDetectorKernel.h
new file mode 100644
index 0000000..45c2809
--- /dev/null
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEHOGDETECTORKERNEL_H
+#define ARM_COMPUTE_NEHOGDETECTORKERNEL_H
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/IHOG.h"
+#include "src/core/NEON/INEKernel.h"
+#include "support/Mutex.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform HOG detection using a linear SVM */
+class NEHOGDetectorKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEHOGDetectorKernel";
+    }
+    /** Default constructor */
+    NEHOGDetectorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGDetectorKernel(const NEHOGDetectorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGDetectorKernel &operator=(const NEHOGDetectorKernel &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEHOGDetectorKernel(NEHOGDetectorKernel &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEHOGDetectorKernel &operator=(NEHOGDetectorKernel &&) = delete;
+    /** Default destructor */
+    ~NEHOGDetectorKernel() = default;
+
+    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
+     *
+     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref NEHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog                     HOG data object used by @ref NEHOGOrientationBinningKernel and @ref NEHOGBlockNormalizationKernel
+     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
+     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be a multiple of hog->info()->block_stride()
+     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
+     */
+    void configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, uint16_t idx_class = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor         *_input;
+    IDetectionWindowArray *_detection_windows;
+    const float           *_hog_descriptor;
+    float                  _bias;
+    float                  _threshold;
+    uint16_t               _idx_class;
+    size_t                 _num_bins_per_descriptor_x;
+    size_t                 _num_blocks_per_descriptor_y;
+    size_t                 _block_stride_width;
+    size_t                 _block_stride_height;
+    size_t                 _detection_window_width;
+    size_t                 _detection_window_height;
+    size_t                 _max_num_detection_windows;
+    arm_compute::Mutex     _mutex;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEHOGDETECTORKERNEL_H */
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index be68b9c..4159e43 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "src/core/NEON/kernels/NEHarrisCornersKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.h b/src/core/NEON/kernels/NEHarrisCornersKernel.h
new file mode 100644
index 0000000..4b79410
--- /dev/null
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEHARRISCORNERSKERNEL_H
+#define ARM_COMPUTE_NEHARRISCORNERSKERNEL_H
+
+#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
+#include "arm_compute/core/IArray.h"
+#include "src/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Common interface for all Harris Score kernels */
+class INEHarrisScoreKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    INEHarrisScoreKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEHarrisScoreKernel(const INEHarrisScoreKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEHarrisScoreKernel &operator=(const INEHarrisScoreKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    INEHarrisScoreKernel(INEHarrisScoreKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    INEHarrisScoreKernel &operator=(INEHarrisScoreKernel &&) = default;
+    /** Default destructor */
+    ~INEHarrisScoreKernel() = default;
+
+public:
+    /** Setup the kernel parameters
+     *
+     * @param[in]  input1           Source image (gradient X). Data types supported: S16/S32
+     * @param[in]  input2           Source image (gradient Y). Data types supported: same as @p input1
+     * @param[out] output           Destination image (harris score). Data types supported: F32
+     * @param[in]  norm_factor      Normalization factor to use according to the gradient size (Must be different from 0)
+     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    virtual void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) = 0;
+
+protected:
+    const IImage *_input1;          /**< Source image - Gx component */
+    const IImage *_input2;          /**< Source image - Gy component */
+    IImage       *_output;          /**< Destination image - Harris score */
+    float         _sensitivity;     /**< Sensitivity value */
+    float         _strength_thresh; /**< Threshold value */
+    float         _norm_factor;     /**< Normalization factor */
+    BorderSize    _border_size;     /**< Border size */
+};
+
+/** Template NEON kernel to compute the Harris score.
+ *  The implementation supports block_size values of 3, 5 and 7
+ */
+template <int32_t block_size>
+class NEHarrisScoreKernel : public INEHarrisScoreKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEHarrisScoreKernel";
+    }
+    /** Default constructor */
+    NEHarrisScoreKernel();
+    // Inherited methods overridden:
+    void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override;
+    BorderSize border_size() const override;
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the specialised Harris score functions */
+    using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+                                     float norm_factor, float sensitivity, float strength_thresh);
+    /** Harris Score function to use for the particular image types passed to configure() */
+    HarrisScoreFunction *_func;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEHARRISCORNERSKERNEL_H */
diff --git a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
index a507125..227013a 100644
--- a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h
new file mode 100644
index 0000000..9d100eb
--- /dev/null
+++ b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H
+#define ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the height concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class NEHeightConcatenateLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEHeightConcatenateLayerKernel";
+    }
+    /** Default constructor */
+    NEHeightConcatenateLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHeightConcatenateLayerKernel(const NEHeightConcatenateLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHeightConcatenateLayerKernel &operator=(const NEHeightConcatenateLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEHeightConcatenateLayerKernel(NEHeightConcatenateLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEHeightConcatenateLayerKernel &operator=(NEHeightConcatenateLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEHeightConcatenateLayerKernel() = default;
+    /** Initialise the kernel's input and output tensor infos
+     *
+     * @param[in]     input         Input tensor info. Data types supported: All
+     * @param[in]     height_offset The starting offset on the Y axis for the output tensor.
+     * @param[in,out] output        Output tensor info. Data types supported: Same as @p input.
+     *
+     */
+    void configure(const ITensorInfo *input, unsigned int height_offset, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEHeightConcatenateLayerKernel
+     *
+     * @param[in] input         Input tensor info. Data types supported: All
+     * @param[in] height_offset The starting offset on the Y axis for the output tensor.
+     * @param[in] output        Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+private:
+    unsigned int _height_offset;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEHistogramKernel.cpp b/src/core/NEON/kernels/NEHistogramKernel.cpp
index 12d1bb8..eddc3b2 100644
--- a/src/core/NEON/kernels/NEHistogramKernel.cpp
+++ b/src/core/NEON/kernels/NEHistogramKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
+#include "src/core/NEON/kernels/NEHistogramKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEHistogramKernel.h b/src/core/NEON/kernels/NEHistogramKernel.h
new file mode 100644
index 0000000..e14519c
--- /dev/null
+++ b/src/core/NEON/kernels/NEHistogramKernel.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEHISTOGRAMKERNEL_H
+#define ARM_COMPUTE_NEHISTOGRAMKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+#include "support/Mutex.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+class IDistribution1D;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the histogram kernel */
+class NEHistogramKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEHistogramKernel";
+    }
+    /** Default constructor */
+    NEHistogramKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHistogramKernel(const NEHistogramKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHistogramKernel &operator=(const NEHistogramKernel &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEHistogramKernel(NEHistogramKernel &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEHistogramKernel &operator=(NEHistogramKernel &&) = delete;
+    /** Default destructor */
+    ~NEHistogramKernel() = default;
+
+    /** Set the input image and the distribution output.
+     *
+     * @param[in]     input      Source image. Data type supported: U8.
+     * @param[out]    output     Destination distribution.
+     * @param[in,out] local_hist Array that the threads use to save their local histograms.
+     *                           Its size should be equal to (number_of_threads * num_bins),
+     *                           and the Window::thread_id() is used to determine the part of the array
+     *                           used by each thread.
+     * @param[out]    window_lut LUT with pre-calculated possible window values.
+     *                           The size of the LUT should be equal to max_range_size and it will be filled
+     *                           during the configure stage, while it is re-used in every run, therefore it can be
+     *                           safely shared among threads.
+     */
+    void configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut);
+    /** Set the input image and the distribution output.
+     *
+     * @note Used for histogram of fixed size equal to 256
+     *
+     * @param[in]  input  Source image. Data type supported: U8.
+     * @param[out] output Destination distribution which must be of 256 bins.
+     */
+    void configure(const IImage *input, IDistribution1D *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Function to merge multiple partial histograms.
+     *
+     * @param[out] global_hist Pointer to the final histogram.
+     * @param[in]  local_hist  Pointer to the partial histograms.
+     * @param[in]  bins        Number of bins.
+     */
+    void merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins);
+    /** Function to merge multiple minimum values of partial histograms.
+     *
+     * @param[out] global_min Pointer to the global min value.
+     * @param[in]  local_min  Local min value.
+     */
+    void merge_min(uint8_t *global_min, const uint8_t &local_min);
+    /** Function to perform histogram on the given window
+     *
+     * @param[in] win  Region on which to execute the kernel
+     * @param[in] info Info about the executing thread
+     */
+    void histogram_U8(Window win, const ThreadInfo &info);
+    /** Function to perform histogram on the given window where histogram is
+     *  of fixed size 256 without ranges and offsets.
+     *
+     * @param[in] win  Region on which to execute the kernel
+     * @param[in] info Info about the executing thread
+     */
+    void histogram_fixed_U8(Window win, const ThreadInfo &info);
+    /** Pre-calculate the pixel windowing for every possible pixel
+     *
+     * Calculate (V - offset) * numBins / range where V is every possible pixel value.
+     *
+     * @note We currently support U8 image thus possible pixel values are between 0 and 255
+     */
+    void calculate_window_lut() const;
+    /** Common signature for all the specialised Histogram functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window, const ThreadInfo &info);
+
+    HistogramFunctionPtr          _func; ///< Histogram function to use for the particular image types passed to configure()
+    const IImage                 *_input;
+    IDistribution1D              *_output;
+    uint32_t                     *_local_hist;
+    uint32_t                     *_window_lut;
+    arm_compute::Mutex            _hist_mtx;
+    static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEHISTOGRAMKERNEL_H */
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 915ea75..93bfcc5 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "src/core/NEON/kernels/NEIm2ColKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.h b/src/core/NEON/kernels/NEIm2ColKernel.h
new file mode 100644
index 0000000..6c1c631
--- /dev/null
+++ b/src/core/NEON/kernels/NEIm2ColKernel.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEIM2COLKERNEL_H
+#define ARM_COMPUTE_NEIM2COLKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+class Size2D;
+
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns, stripping out each convolution block to a single column.
+ * It is used to transform a convolution into a plain matrix multiplication.
+ *
+ * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NEIm2ColKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEIm2ColKernel";
+    }
+    /** Default constructor */
+    NEIm2ColKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEIm2ColKernel(const NEIm2ColKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEIm2ColKernel &operator=(const NEIm2ColKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEIm2ColKernel(NEIm2ColKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEIm2ColKernel &operator=(NEIm2ColKernel &&) = default;
+    /** Default destructor */
+    ~NEIm2ColKernel() = default;
+
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                         while every optional dimension from 4 and above represents a batch of inputs.
+     *                         Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
+     *                         Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
+     * @param[out] output      The output tensor. Data types supported: Same as @p input
+     * @param[in]  kernel_dims The kernel dimensions (width and height).
+     * @param[in]  conv_info   Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  has_bias    In case biases are provided, expands the matrix with 1.
+     * @param[in]  dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  num_groups  (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+     */
+    void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+                   bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel
+     *
+     * @param[in] input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                        while every optional dimension from 4 and above represents a batch of inputs.
+     *                        Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
+     *                        Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
+     * @param[in] output      The output tensor. Data types supported: Same as @p input
+     * @param[in] kernel_dims The kernel dimensions (width and height).
+     * @param[in] conv_info   Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in] has_bias    In case biases are provided, expands the matrix with 1.
+     * @param[in] dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in] num_groups  (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+                           bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Template function to run im2col
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T, bool has_pads, bool is_nchw>
+    void run_im2col(const Window &window);
+
+    /** Common signature for all the specialised im2col functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using Im2ColFunctionPtr = void (NEIm2ColKernel::*)(const Window &window);
+
+    Im2ColFunctionPtr _func;
+    const ITensor    *_input;
+    ITensor          *_output;
+    std::pair<unsigned int, unsigned int> _convolved_dims;
+    PadStrideInfo _conv_info;
+    unsigned int  _kernel_width;
+    unsigned int  _kernel_height;
+    bool          _has_bias;
+    Size2D        _dilation;
+    DataLayout    _data_layout;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEIM2COLKERNEL_H */
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index 7aa23de..08bf6f0 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
+#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
new file mode 100644
index 0000000..96c0119
--- /dev/null
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+struct InstanceNormalizationLayerKernelInfo;
+
+/** Interface for performing an instance normalization */
+class NEInstanceNormalizationLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEInstanceNormalizationLayerKernel";
+    }
+    /** Default constructor */
+    NEInstanceNormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEInstanceNormalizationLayerKernel(const NEInstanceNormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEInstanceNormalizationLayerKernel &operator=(const NEInstanceNormalizationLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEInstanceNormalizationLayerKernel(NEInstanceNormalizationLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEInstanceNormalizationLayerKernel &operator=(NEInstanceNormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEInstanceNormalizationLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in, out] input  Source tensor. Data types supported: F16/F32. Data layout supported: NCHW.
+     *                        In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+     * @param[out]     output Destination tensor. Data types and data layouts supported: same as @p input.
+     * @param[in]      info   Kernel meta-data descriptor
+     */
+    void configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEInstanceNormalizationLayerKernel.
+     *
+     * @param[in] input  Source tensor info. Data types supported: F16/F32. Data layout supported: NCHW
+     * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
+     * @param[in] info   Kernel meta-data descriptor
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the specialized instance normalization functions
+     *
+     * @param[in, out] input   An input tensor. In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+     * @param[out]     output  The output tensor.
+     * @param[in]      gamma   The scale scalar value applied to the normalized tensor.
+     * @param[in]      beta    The offset scalar value applied to the normalized tensor.
+     * @param[in]      epsilon Lower bound value for the normalization.
+     */
+    using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+
+    NormalizationFunction *_func;
+    ITensor               *_input;
+    ITensor               *_output;
+    float                  _gamma;
+    float                  _beta;
+    float                  _epsilon;
+    bool                   _use_mixed_precision{ true };
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
index 5fc6ca6..6ee97ee 100644
--- a/src/core/NEON/kernels/NEIntegralImageKernel.cpp
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
+#include "src/core/NEON/kernels/NEIntegralImageKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.h b/src/core/NEON/kernels/NEIntegralImageKernel.h
new file mode 100644
index 0000000..8d92504
--- /dev/null
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H
+#define ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to perform an image integral on an image */
+class NEIntegralImageKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEIntegralImageKernel";
+    }
+    /** Default constructor */
+    NEIntegralImageKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEIntegralImageKernel(const NEIntegralImageKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEIntegralImageKernel &operator=(const NEIntegralImageKernel &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEIntegralImageKernel(NEIntegralImageKernel &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEIntegralImageKernel &operator=(NEIntegralImageKernel &&) = delete;
+    /** Default destructor */
+    ~NEIntegralImageKernel() = default;
+    /** Set the source and destination tensors of the kernel
+     *
+     * @param[in]  input  Source tensor. Data type supported: U8
+     * @param[out] output Destination tensor. Data type supported: U32
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+    bool       is_parallelisable() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index a216981..dae5b57 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
+#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
new file mode 100644
index 0000000..af3ad34
--- /dev/null
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEL2NORMALIZELAYERKERNEL_H
+#define ARM_COMPUTE_NEL2NORMALIZELAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for performing an L2 normalization on a given axis, given the square sum of the input along this axis */
+class NEL2NormalizeLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEL2NormalizeLayerKernel";
+    }
+    /** Default constructor */
+    NEL2NormalizeLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEL2NormalizeLayerKernel(const NEL2NormalizeLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEL2NormalizeLayerKernel &operator=(const NEL2NormalizeLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEL2NormalizeLayerKernel(NEL2NormalizeLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEL2NormalizeLayerKernel &operator=(NEL2NormalizeLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEL2NormalizeLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. Data types supported: F16/F32.
+     * @param[in]  sum     Sum values tensor. Data types supported: same as @p input.
+     *                     Sum will have the same number of dimensions as input.
+     * @param[out] output  Destination tensor. Data types and data layouts supported: same as @p input.
+     *                     Output will have the same number of dimensions as input.
+     * @param[in]  axis    Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
+     * @param[in]  epsilon Lower bound value for the normalization.
+     */
+    void configure(const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEL2NormalizeLayerKernel.
+     *
+     * @param[in] input   Source tensor info. Data types supported: F16/F32.
+     * @param[in] sum     Sum values tensor info. Data types supported: same as @p input.
+     *                    Sum will have the same number of dimensions as input.
+     * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p input.
+     *                    Output will have the same number of dimensions as input.
+     * @param[in] axis    Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
+     * @param[in] epsilon Lower bound value for the normalization.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;
+    const ITensor *_sum;
+    ITensor       *_output;
+    unsigned int   _actual_axis;
+    float          _epsilon;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEL2NORMALIZELAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
index 6567a8d..442f001 100644
--- a/src/core/NEON/kernels/NELKTrackerKernel.cpp
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+#include "src/core/NEON/kernels/NELKTrackerKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.h b/src/core/NEON/kernels/NELKTrackerKernel.h
new file mode 100644
index 0000000..c24166c
--- /dev/null
+++ b/src/core/NEON/kernels/NELKTrackerKernel.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_LKTRACKERKERNEL_H
+#define ARM_COMPUTE_LKTRACKERKERNEL_H
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+#include <utility>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for NEON Array of Internal Key Points. */
+using INELKInternalKeypointArray = IArray<NELKInternalKeypoint>;
+
+/** Interface for the Lucas-Kanade tracker kernel */
+class NELKTrackerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NELKTrackerKernel";
+    }
+    /** Default constructor */
+    NELKTrackerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELKTrackerKernel(const NELKTrackerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELKTrackerKernel &operator=(const NELKTrackerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NELKTrackerKernel(NELKTrackerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NELKTrackerKernel &operator=(NELKTrackerKernel &&) = default;
+    /** Default destructor */
+    ~NELKTrackerKernel() = default;
+
+    /** Initialise the kernel input and output
+     *
+     * @param[in]      input_old            Pointer to the input old tensor. Data type supported: U8
+     * @param[in]      input_new            Pointer to the input new tensor. Data type supported: U8
+     * @param[in]      old_scharr_gx        Pointer to the input scharr X tensor. Data type supported: S16
+     * @param[in]      old_scharr_gy        Pointer to the input scharr Y tensor. Data type supported: S16
+     * @param[in]      old_points           Pointer to the IKeyPointArray storing old key points
+     * @param[in]      new_points_estimates Pointer to the IKeyPointArray storing new estimates key points
+     * @param[out]     new_points           Pointer to the IKeyPointArray storing new key points
+     * @param[in, out] old_points_internal  Pointer to the array of NELKInternalKeypoint for old points
+     * @param[out]     new_points_internal  Pointer to the array of NELKInternalKeypoint for new points
+     * @param[in]      termination          The criteria to terminate the search of each keypoint.
+     * @param[in]      use_initial_estimate The flag to indicate whether the initial estimated position should be used
+     * @param[in]      epsilon              The error for terminating the algorithm
+     * @param[in]      num_iterations       The maximum number of iterations before terminate the algorithm
+     * @param[in]      window_dimension     The size of the window on which to perform the algorithm
+     * @param[in]      level                The pyramid level
+     * @param[in]      num_levels           The number of pyramid levels
+     * @param[in]      pyramid_scale        Scale factor used for generating the pyramid
+     */
+    void configure(const ITensor *input_old, const ITensor *input_new, const ITensor *old_scharr_gx, const ITensor *old_scharr_gy,
+                   const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, IKeyPointArray *new_points,
+                   INELKInternalKeypointArray *old_points_internal, INELKInternalKeypointArray *new_points_internal,
+                   Termination termination, bool use_initial_estimate, float epsilon, unsigned int num_iterations, size_t window_dimension,
+                   size_t level, size_t num_levels, float pyramid_scale);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Initialise the array of keypoints in the provided range
+     *
+     * @param[in] start Index of first element in the keypoints array to be initialised
+     * @param[in] end   Index after last element in the keypoints array to be initialised
+     */
+    void init_keypoints(int start, int end);
+    /** Compute the structure tensor A^T * A based on the scharr gradients I_x and I_y
+     *
+     * @param[in]  keypoint    Keypoint for which gradients are computed
+     * @param[out] bilinear_ix Intermediate interpolated data for X gradient
+     * @param[out] bilinear_iy Intermediate interpolated data for Y gradient
+     *
+     * @return Values A11, A12, A22
+     */
+    std::tuple<int, int, int> compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int32_t *bilinear_ix, int32_t *bilinear_iy);
+    /** Compute the vector A^T * b, i.e. -sum(I_d * I_t) for d in {x,y}
+     *
+     * @param[in] old_keypoint Old keypoint for which gradient is computed
+     * @param[in] new_keypoint New keypoint for which gradient is computed
+     * @param[in] bilinear_ix  Intermediate interpolated data for X gradient
+     * @param[in] bilinear_iy  Intermediate interpolated data for Y gradient
+     *
+     * @return Values b1, b2
+     */
+    std::pair<int, int> compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int32_t *bilinear_ix, const int32_t *bilinear_iy);
+
+    const ITensor              *_input_old;
+    const ITensor              *_input_new;
+    const ITensor              *_old_scharr_gx;
+    const ITensor              *_old_scharr_gy;
+    IKeyPointArray             *_new_points;
+    const IKeyPointArray       *_new_points_estimates;
+    const IKeyPointArray       *_old_points;
+    INELKInternalKeypointArray *_old_points_internal;
+    INELKInternalKeypointArray *_new_points_internal;
+    Termination                 _termination;
+    bool                        _use_initial_estimate;
+    float                       _pyramid_scale;
+    float                       _epsilon;
+    unsigned int                _num_iterations;
+    int                         _window_dimension;
+    unsigned int                _level;
+    unsigned int                _num_levels;
+    ValidRegion                 _valid_region;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_LKTRACKERKERNEL_H */
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index b8e6a6d..f11694d 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
new file mode 100644
index 0000000..72093b4
--- /dev/null
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
+#define ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply each row of the first tensor with the lowest 2 dimensions of the second tensor. */
+class NELocallyConnectedMatrixMultiplyKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NELocallyConnectedMatrixMultiplyKernel";
+    }
+    /** Default constructor */
+    NELocallyConnectedMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELocallyConnectedMatrixMultiplyKernel(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELocallyConnectedMatrixMultiplyKernel &operator=(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NELocallyConnectedMatrixMultiplyKernel(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NELocallyConnectedMatrixMultiplyKernel &operator=(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Default destructor */
+    ~NELocallyConnectedMatrixMultiplyKernel() = default;
+    /** Initialise the kernel's input and output
+     *
+     * @param[in]  input0 First input tensor. Data types supported: F16, F32
+     * @param[in]  input1 Second input tensor containing the Matrix B. Data type supported: same as @p input0
+     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     */
+    void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NELocallyConnectedMatrixMultiplyKernel
+     *
+     * @param[in] input0 First input tensor info. Data types supported: F16, F32
+     * @param[in] input1 Second input tensor info. Data type supported: same as @p input0
+     * @param[in] output Output tensor info. Data type supported: same as @p input0
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input0;
+    const ITensor *_input1;
+    ITensor       *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
index 8d82e1a..205f678 100644
--- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.h b/src/core/NEON/kernels/NEMagnitudePhaseKernel.h
new file mode 100644
index 0000000..3803d05
--- /dev/null
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H
+#define ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Template interface for the kernel to compute magnitude and phase */
+template <MagnitudeType mag_type, PhaseType phase_type>
+class NEMagnitudePhaseKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEMagnitudePhaseKernel";
+    }
+    /** Default constructor */
+    NEMagnitudePhaseKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMagnitudePhaseKernel(const NEMagnitudePhaseKernel &) = delete;
+    /** Default move constructor */
+    NEMagnitudePhaseKernel(NEMagnitudePhaseKernel &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMagnitudePhaseKernel &operator=(const NEMagnitudePhaseKernel &) = delete;
+    /** Default move assignment operator */
+    NEMagnitudePhaseKernel &operator=(NEMagnitudePhaseKernel &&) = default;
+    /** Destructor */
+    ~NEMagnitudePhaseKernel() = default;
+
+    /** Initialise the kernel's input, output.
+     *
+     * @note At least one of @p magnitude or @p phase must be set
+     *
+     * @param[in]  gx        Gradient X tensor. Data type supported: S16.
+     * @param[in]  gy        Gradient Y tensor. Data type supported: S16.
+     * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
+     * @param[out] phase     (Optional) The output tensor - Phase. Data type supported: U8.
+     */
+    void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Function to perform magnitude on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void magnitude(const Window &window);
+    /** Function to perform phase on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void phase(const Window &window);
+    /** Function to perform magnitude and phase on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void magnitude_phase(const Window &window);
+
+private:
+    /** Common signature for all the specialised MagnitudePhase functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseKernel::*)(const Window &window);
+    /** MagnitudePhase function to use for the particular formats passed to configure() */
+    MagnitudePhaseFunctionPtr _func;
+    const ITensor            *_gx;        /**< Input gradient X */
+    const ITensor            *_gy;        /**< Input gradient Y */
+    ITensor                  *_magnitude; /**< Output - Magnitude */
+    ITensor                  *_phase;     /**< Output - Phase */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
index 87caf00..761fa15 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
+#include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
new file mode 100644
index 0000000..8cdfe2b
--- /dev/null
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H
+#define ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the max unpooling layer kernel */
+class NEMaxUnpoolingLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEMaxUnpoolingLayerKernel";
+    }
+    /** Default constructor */
+    NEMaxUnpoolingLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMaxUnpoolingLayerKernel(const NEMaxUnpoolingLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMaxUnpoolingLayerKernel &operator=(const NEMaxUnpoolingLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMaxUnpoolingLayerKernel(NEMaxUnpoolingLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMaxUnpoolingLayerKernel &operator=(NEMaxUnpoolingLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEMaxUnpoolingLayerKernel() = default;
+    /** Set the input, indices and output tensors.
+     *
+     * @note Output shape must be equal to the shape of the original input to pool.
+     *
+     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  indices   Tensor containing the offset to store the input elements in the output tensor.
+     *                       @ref NEPoolingLayerKernel with indices should precede this function in order to
+     *                       properly reconstruct the output tensor.
+     *                       The shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     */
+    void configure(const ITensor *input, const ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEMaxUnpoolingLayerKernel
+     *
+     * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] indices   Tensor info of the indices of the maximal values. Data type supported: U32.
+     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Function to perform the unpooling on the given window. NOTE(review): the '2' suffix presumably denotes a 2x2 pool size - confirm against the .cpp.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     */
+    template <typename T>
+    void unpooling2(const Window &window_input);
+
+    using UnpoolingFunction = void (NEMaxUnpoolingLayerKernel::*)(const Window &window); /**< Member-function signature shared by the unpooling2<T>() instantiations */
+
+private:
+    UnpoolingFunction _func; /**< Unpooling member function to call. NOTE(review): presumably selected in configure() - confirm */
+    const ITensor    *_input;
+    ITensor          *_output;
+    const ITensor    *_indices;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
index c4e036a..a6bb9f2 100644
--- a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
+#include "src/core/NEON/kernels/NEMeanStdDevKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.h b/src/core/NEON/kernels/NEMeanStdDevKernel.h
new file mode 100644
index 0000000..e694f38
--- /dev/null
+++ b/src/core/NEON/kernels/NEMeanStdDevKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMEANSTDDEVKERNEL_H
+#define ARM_COMPUTE_NEMEANSTDDEVKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+#include "support/Mutex.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
+class NEMeanStdDevKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEMeanStdDevKernel";
+    }
+    /** Default constructor */
+    NEMeanStdDevKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMeanStdDevKernel(const NEMeanStdDevKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMeanStdDevKernel &operator=(const NEMeanStdDevKernel &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEMeanStdDevKernel(NEMeanStdDevKernel &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEMeanStdDevKernel &operator=(NEMeanStdDevKernel &&) = delete;
+    /** Default destructor */
+    ~NEMeanStdDevKernel() = default;
+
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input              Input image. Data type supported: U8.
+     * @param[out] mean               Output average pixel value.
+     * @param[out] global_sum         Keeps global sum of pixel values.
+     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
+     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
+     */
+    void configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev = nullptr, uint64_t *global_sum_squared = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+    BorderSize border_size() const override;
+
+private:
+    const IImage      *_input;
+    float             *_mean;
+    float             *_stddev;
+    uint64_t          *_global_sum;
+    uint64_t          *_global_sum_squared;
+    arm_compute::Mutex _mtx; /**< NOTE(review): presumably the non movable object that requires the deleted move operations - confirm */
+    BorderSize         _border_size;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEMEANSTDDEVKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index 8ee9ff6..6a41e3a 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
+#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
new file mode 100644
index 0000000..59d073a
--- /dev/null
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H
+#define ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include <arm_fp16.h>
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to normalize the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension. */
+class NEMeanStdDevNormalizationKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEMeanStdDevNormalizationKernel";
+    }
+    /** Default constructor */
+    NEMeanStdDevNormalizationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMeanStdDevNormalizationKernel(const NEMeanStdDevNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMeanStdDevNormalizationKernel &operator=(const NEMeanStdDevNormalizationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMeanStdDevNormalizationKernel(NEMeanStdDevNormalizationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMeanStdDevNormalizationKernel &operator=(NEMeanStdDevNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~NEMeanStdDevNormalizationKernel() = default;
+    /** Initialise the kernel's input and outputs.
+     *
+     * @note If the output tensor is a nullptr, the normalization will be performed in-place.
+     *
+     * @param[in, out] input   Source tensor with 2 dimensions. In case of @p output tensor = nullptr,
+     *                         this tensor will store the result of the normalization. Data types supported: F16/F32.
+     * @param[out]     output  (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input.
+     * @param[in]      epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
+     */
+    void configure(ITensor *input, ITensor *output = nullptr, float epsilon = 1e-8f);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEMeanStdDevNormalizationKernel
+     *
+     * @param[in] input   Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
+     *                    this tensor will store the result of the normalization. Data types supported: F16/F32.
+     * @param[in] output  (Optional) Destination tensor info. It can be nullptr in case of in-place computation. Data type supported: same as @p input.
+     * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output = nullptr, float epsilon = 1e-8f);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Normalizes the input with respect to mean and standard deviation (templated on ScalarType and an integer size).
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename ScalarType, int size>
+    void mean_stddev_normalization(const Window &window);
+
+    ITensor *_input;
+    ITensor *_output;
+    float    _epsilon;
+
+    using MeanStdDevNormFunction = void (NEMeanStdDevNormalizationKernel::*)(const Window &window); /**< Member-function signature shared by the mean_stddev_normalization<>() instantiations */
+
+    MeanStdDevNormFunction _func; /**< Normalization member function to call. NOTE(review): presumably selected in configure() - confirm */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
index 86fcc30..0160edc 100644
--- a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
@@ -21,14 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
+#include "src/core/NEON/kernels/NEMedian3x3Kernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.h b/src/core/NEON/kernels/NEMedian3x3Kernel.h
new file mode 100644
index 0000000..b9e28b3
--- /dev/null
+++ b/src/core/NEON/kernels/NEMedian3x3Kernel.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMEDIAN3x3KERNEL_H
+#define ARM_COMPUTE_NEMEDIAN3x3KERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to perform a median filter on a tensor. Derives from @ref INESimpleKernel and declares no data members of its own. */
+class NEMedian3x3Kernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEMedian3x3Kernel";
+    }
+    /** Default constructor */
+    NEMedian3x3Kernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMedian3x3Kernel(const NEMedian3x3Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMedian3x3Kernel &operator=(const NEMedian3x3Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMedian3x3Kernel(NEMedian3x3Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMedian3x3Kernel &operator=(NEMedian3x3Kernel &&) = default;
+    /** Default destructor */
+    ~NEMedian3x3Kernel() = default;
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data type supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEMEDIAN3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NEMemsetKernel.cpp b/src/core/NEON/kernels/NEMemsetKernel.cpp
index fd427cc..a8dfda3 100644
--- a/src/core/NEON/kernels/NEMemsetKernel.cpp
+++ b/src/core/NEON/kernels/NEMemsetKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
+#include "src/core/NEON/kernels/NEMemsetKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEMemsetKernel.h b/src/core/NEON/kernels/NEMemsetKernel.h
new file mode 100644
index 0000000..a720e60
--- /dev/null
+++ b/src/core/NEON/kernels/NEMemsetKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMEMSETKERNEL_H
+#define ARM_COMPUTE_NEMEMSETKERNEL_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for filling the planes of a tensor with a constant value */
+class NEMemsetKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEMemsetKernel";
+    }
+    /** Default constructor */
+    NEMemsetKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMemsetKernel(const NEMemsetKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMemsetKernel &operator=(const NEMemsetKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMemsetKernel(NEMemsetKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMemsetKernel &operator=(NEMemsetKernel &&) = default;
+    /** Default destructor */
+    ~NEMemsetKernel() = default;
+    /** Initialise the kernel's tensor and filling value
+     *
+     * @param[in,out] tensor         Input tensor to fill. Supported data types: All.
+     * @param[in]     constant_value The value used to fill the planes of the tensor.
+     */
+    void configure(ITensor *tensor, const PixelValue &constant_value);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    ITensor   *_tensor;         /**< NOTE(review): presumably the tensor passed to configure() - confirm */
+    PixelValue _constant_value; /**< NOTE(review): presumably the fill value passed to configure() - confirm */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEMEMSETKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
index f675c39..92f6b4a 100644
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h"
+#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.h b/src/core/NEON/kernels/NEMinMaxLayerKernel.h
new file mode 100644
index 0000000..b4852ad
--- /dev/null
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_NEMINMAXLAYERKERNEL_H
+#define ARM_COMPUTE_NEMINMAXLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+#include "support/Mutex.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform min max search on a 3D tensor. */
+class NEMinMaxLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEMinMaxLayerKernel";
+    }
+    /** Default constructor */
+    NEMinMaxLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxLayerKernel(const NEMinMaxLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxLayerKernel &operator=(const NEMinMaxLayerKernel &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEMinMaxLayerKernel(NEMinMaxLayerKernel &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEMinMaxLayerKernel &operator=(NEMinMaxLayerKernel &&) = delete;
+    /** Default destructor */
+    ~NEMinMaxLayerKernel() = default;
+
+    /** Initialise the kernel's input and outputs.
+     *
+     * @note output[0] = minimum
+     * @note output[1] = maximum
+     *
+     * @param[in]  input  Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data type supported: F32.
+     * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum value for each 3D input tensor.
+     *                    The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEMinMaxLayerKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: F32.
+     * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
+     *                   The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+    /** Resets global minimum and maximum. */
+    void reset();
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    void update_min_max(float *out_ptr, float min, float max); // NOTE(review): undocumented helper; presumably folds min/max into the values at out_ptr - confirm in the .cpp
+    const ITensor     *_input;
+    ITensor           *_output;
+    arm_compute::Mutex _mtx; /**< NOTE(review): presumably the non movable object that requires the deleted move operations - confirm */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEMINMAXLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index e1691dc..402e6f1 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
+#include "src/core/NEON/kernels/NEMinMaxLocationKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.h b/src/core/NEON/kernels/NEMinMaxLocationKernel.h
new file mode 100644
index 0000000..a246660
--- /dev/null
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H
+#define ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H
+
+#include "arm_compute/core/IArray.h"
+#include "src/core/NEON/INEKernel.h"
+#include "support/Mutex.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the kernel to perform min max search on an image. */
+class NEMinMaxKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEMinMaxKernel";
+    }
+    /** Default constructor */
+    NEMinMaxKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxKernel(const NEMinMaxKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxKernel &operator=(const NEMinMaxKernel &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEMinMaxKernel(NEMinMaxKernel &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEMinMaxKernel &operator=(NEMinMaxKernel &&) = delete;
+    /** Default destructor */
+    ~NEMinMaxKernel() = default;
+
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input Input Image. Data types supported: U8/S16/F32.
+     * @param[out] min   Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
+     * @param[out] max   Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
+     */
+    void configure(const IImage *input, void *min, void *max);
+    /** Resets global minimum and maximum. */
+    void reset();
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Performs the min/max algorithm on U8 images on a given window.
+     *
+     * @param win The window to run the algorithm on.
+     */
+    void minmax_U8(Window win);
+    /** Performs the min/max algorithm on S16 images on a given window.
+     *
+     * @param win The window to run the algorithm on.
+     */
+    void minmax_S16(Window win);
+    /** Performs the min/max algorithm on F32 images on a given window.
+     *
+     * @param win The window to run the algorithm on.
+     */
+    void minmax_F32(Window win);
+    /** Common signature for all the specialised MinMax functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using MinMaxFunction = void (NEMinMaxKernel::*)(Window window);
+    /** MinMax function to use for the particular image types passed to configure() */
+    MinMaxFunction _func;
+    /** Helper to update min/max values **/
+    template <typename T>
+    void update_min_max(T min, T max);
+
+    const IImage      *_input; /**< Input image. */
+    void              *_min;   /**< Minimum value. */
+    void              *_max;   /**< Maximum value. */
+    arm_compute::Mutex _mtx;   /**< Mutex used for result reduction. */
+};
+
+/** Interface for the kernel to find min max locations of an image. */
+class NEMinMaxLocationKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEMinMaxLocationKernel";
+    }
+    /** Default constructor */
+    NEMinMaxLocationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxLocationKernel(const NEMinMaxLocationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxLocationKernel &operator=(const NEMinMaxLocationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMinMaxLocationKernel(NEMinMaxLocationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMinMaxLocationKernel &operator=(NEMinMaxLocationKernel &&) = default;
+    /** Default destructor */
+    ~NEMinMaxLocationKernel() = default;
+
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input     Input Image. Data types supported: U8/S16/F32.
+     * @param[out] min       Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
+     * @param[out] max       Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
+     * @param[out] min_loc   Array of minimum value locations.
+     * @param[out] max_loc   Array of maximum value locations.
+     * @param[out] min_count Number of minimum value encounters.
+     * @param[out] max_count Number of maximum value encounters.
+     */
+    void configure(const IImage *input, void *min, void *max,
+                   ICoordinates2DArray *min_loc = nullptr, ICoordinates2DArray *max_loc = nullptr,
+                   uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    bool is_parallelisable() const override;
+
+private:
+    /** Performs the min/max location algorithm on T type images on a given window.
+     *
+     * @param win The window to run the algorithm on.
+     */
+    template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
+    void minmax_loc(const Window &win);
+    /** Common signature for all the specialised MinMaxLoc functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using MinMaxLocFunction = void (NEMinMaxLocationKernel::*)(const Window &window);
+    /** MinMaxLoc function to use for the particular image types passed to configure() */
+    MinMaxLocFunction _func;
+    /** Helper to create a function pointer table for the parameterized MinMaxLocation functions. */
+    template <class T, typename>
+    struct create_func_table;
+
+    const IImage        *_input;     /**< Input image. */
+    void                *_min;       /**< Minimum value. */
+    void                *_max;       /**< Maximum value. */
+    uint32_t            *_min_count; /**< Count of minimum value encounters. */
+    uint32_t            *_max_count; /**< Count of maximum value encounters. */
+    ICoordinates2DArray *_min_loc;   /**< Locations of minimum values. */
+    ICoordinates2DArray *_max_loc;   /**< Locations of maximum values. */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index 31919ea..58c0acd 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
+#include "src/core/NEON/kernels/NENonLinearFilterKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.h b/src/core/NEON/kernels/NENonLinearFilterKernel.h
new file mode 100644
index 0000000..3cef12e
--- /dev/null
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NENONLINEARFILTERKERNEL_H
+#define ARM_COMPUTE_NENONLINEARFILTERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to apply a non-linear filter */
+class NENonLinearFilterKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NENonLinearFilterKernel";
+    }
+    /** Default constructor */
+    NENonLinearFilterKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonLinearFilterKernel(NENonLinearFilterKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NENonLinearFilterKernel(NENonLinearFilterKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &&) = default;
+    /** Default destructor */
+    ~NENonLinearFilterKernel() = default;
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: U8
+     * @param[in]  function         Non linear function to perform
+     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
+     * @param[in]  pattern          Mask pattern
+     * @param[in]  mask             The given mask. Will be used only if pattern is specified to PATTERN_OTHER
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Fill mask with the corresponding given pattern.
+     *
+     * @param[in,out] mask    Mask to be filled according to pattern
+     * @param[in]     cols    Columns (width) of mask
+     * @param[in]     rows    Rows (height) of mask
+     * @param[in]     pattern Pattern to fill the mask according to
+     */
+    void fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern);
+    /** Apply a median filter when given mask pattern is defined as box.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void median_filter_box(const Window &win);
+    /** Apply a min filter when given mask pattern is defined as box.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void min_filter_box(const Window &win);
+    /** Apply a max filter when given mask pattern is defined as box.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void max_filter_box(const Window &win);
+    /** Apply a median filter when given mask pattern is defined as cross.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void median_filter_cross(const Window &win);
+    /** Apply a min filter when given mask pattern is defined as cross.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void min_filter_cross(const Window &win);
+    /** Apply a max filter when given mask pattern is defined as cross.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void max_filter_cross(const Window &win);
+    /** Apply a median filter when given mask pattern is defined as disk.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void median_filter_disk(const Window &win);
+    /** Apply a min filter when given mask pattern is defined as disk.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void min_filter_disk(const Window &win);
+    /** Apply a max filter when given mask pattern is defined as disk.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void max_filter_disk(const Window &win);
+    /** Apply a non-linear filter when given mask has user-defined pattern.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void non_linear_filter_generic(const Window &win);
+
+private:
+    unsigned int            _border_width;
+    const ITensor          *_input;
+    ITensor                *_output;
+    const uint8_t          *_mask;
+    MatrixPattern           _pattern;
+    NonLinearFilterFunction _function;
+    unsigned int            _func_idx;
+    BorderSize              _border_size;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NENONLINEARFILTERKERNEL_H */
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
index 9566ced..9f5dfcd 100644
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
new file mode 100644
index 0000000..d32dfec
--- /dev/null
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H
+#define ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface to perform Non-Maxima suppression over a 3x3 window using NEON
+ *
+ * @note Used by @ref NEFastCorners and @ref NEHarrisCorners
+ */
+class NENonMaximaSuppression3x3Kernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NENonMaximaSuppression3x3Kernel";
+    }
+    /** Default constructor */
+    NENonMaximaSuppression3x3Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonMaximaSuppression3x3Kernel(const NENonMaximaSuppression3x3Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonMaximaSuppression3x3Kernel &operator=(const NENonMaximaSuppression3x3Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NENonMaximaSuppression3x3Kernel(NENonMaximaSuppression3x3Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NENonMaximaSuppression3x3Kernel &operator=(NENonMaximaSuppression3x3Kernel &&) = default;
+    /** Default destructor */
+    ~NENonMaximaSuppression3x3Kernel() = default;
+
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8/F32
+     * @param[out] output           Destination tensor. Data types supported: same as @p input
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+protected:
+    /** Common signature for all the specialised non-maxima suppression 3x3 functions
+     *
+     * @param[in]  input_ptr    Pointer to the input tensor.
+     * @param[out] output_ptr   Pointer to the output tensor
+     * @param[in]  input_stride Stride of the input tensor
+     */
+    using NonMaxSuppr3x3Function = void(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride);
+
+    NonMaxSuppr3x3Function *_func;   /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
+    const ITensor          *_input;  /**< Source tensor */
+    ITensor                *_output; /**< Destination tensor */
+};
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32
+ */
+class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
+{
+public:
+    const char *name() const override
+    {
+        return "NENonMaximaSuppression3x3FP16Kernel";
+    }
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8/F32.
+     * @param[out] output           Destination tensor. Data types supported: same as @p input
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+};
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32 */
+using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 1b72a3e..27464d5 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
+#include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h
new file mode 100644
index 0000000..53a06b9
--- /dev/null
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class NENormalizationLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NENormalizationLayerKernel";
+    }
+    /** Default constructor */
+    NENormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENormalizationLayerKernel(const NENormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENormalizationLayerKernel &operator=(const NENormalizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    NENormalizationLayerKernel(NENormalizationLayerKernel &&) = default;
+    /** Default move assignment operator */
+    NENormalizationLayerKernel &operator=(NENormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NENormalizationLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+     *                           and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. Data layouts supported: NCHW/NHWC.
+     * @param[in]  input_squared Source tensor in which each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
+     *                           Data type and layout supported: same as @p input.
+     * @param[out] output        Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input.
+     * @param[in]  norm_info     Normalization layer information like the normalization type, normalization size and other parameters.
+     */
+    void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel
+     *
+     * @param[in] input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+     *                          and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. Data layouts supported: NCHW/NHWC.
+     * @param[in] input_squared Source tensor in which each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
+     *                          Data type and layout supported: same as @p input.
+     * @param[in] output        Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input.
+     * @param[in] norm_info     Normalization layer information like the normalization type, normalization size and other parameters.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Function to perform normalization depending on the given template
+     *  dimension. The do_2D_norm template parameter specifies whether the
+     *  normalization has to be 1D or 2D.
+     *
+     * @note Only supported normalizations are:
+     *  - 1D over X or Z
+     *  - 2D over X and Y
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename T, unsigned int S, unsigned int dim, bool do_2D_norm>
+    void normalize_float(const Window &window);
+
+    /** Common signature for all the specialised normalization functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window);
+
+private:
+    NormalizationFunction  _func;
+    const ITensor         *_input;
+    const ITensor         *_input_squared;
+    ITensor               *_output;
+    NormalizationLayerInfo _norm_info;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index ca9c541..200fe2c 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h
new file mode 100644
index 0000000..ec4bdff
--- /dev/null
+++ b/src/core/NEON/kernels/NEPadLayerKernel.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEPADLAYERKERNEL_H
+#define ARM_COMPUTE_NEPADLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to add padding to a tensor
+ *
+ * Add padding given padding information
+ */
+class NEPadLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEPadLayerKernel";
+    }
+    /** Default constructor */
+    NEPadLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPadLayerKernel(const NEPadLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPadLayerKernel &operator=(const NEPadLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEPadLayerKernel(NEPadLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEPadLayerKernel &operator=(NEPadLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEPadLayerKernel() = default;
+
+    /** Initialize the function
+     *
+     * @param[in]  input          Source tensor. Data types supported: All.
+     * @param[out] output         Output tensor. Data type supported: same as @p input
+     * @param[in]  padding        The padding for each spatial dimension of the input tensor. The pair padding[i]
+     *                            specifies the front and the end padding in the i-th dimension.
+     * @param[in]  constant_value (Optional) Constant value to be used for the padding
+     * @param[in]  mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT.
+     *                           Only CONSTANT padding mode is currently supported
+     */
+    void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEPadLayerKernel.
+     *
+     * @param[in] input          Source tensor info. Data types supported: All.
+     * @param[in] output         Output tensor info. Data type supported: same as @p input
+     * @param[in] padding        The padding for each spatial dimension of the input tensor. The pair padding[i]
+     *                           specifies the front and the end padding in the i-th dimension.
+     * @param[in] constant_value (Optional) Constant value to be used for the padding
+     * @param[in] mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT.
+     *                           Only CONSTANT padding mode is currently supported
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Template function to run the padding function with constant padding
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run_pad_constant(const Window &window);
+
+    /** Function to run the padding function with constant padding for 3D input and 1D, 2D, 3D padding
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    void run_pad_constant_uint8_3Dinput_3Dpad(const Window &window);
+
+    /** Common signature for all the specialised pad functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using PadFunctionPtr = void (NEPadLayerKernel::*)(const Window &window);
+
+    PadFunctionPtr _func;
+    const ITensor *_input;
+    ITensor       *_output;
+    PaddingList    _padding;
+    PixelValue     _constant_value;
+    PaddingMode    _mode;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEPADLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp
index eab11eb..6a9f5d3 100644
--- a/src/core/NEON/kernels/NEPermuteKernel.cpp
+++ b/src/core/NEON/kernels/NEPermuteKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h"
+#include "src/core/NEON/kernels/NEPermuteKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -123,7 +123,7 @@
     // Input window
     Window window_in = window;
 
-    // we only support these two configs in arm_compute/core/NEON/kernels/convolution/common/shims.hpp, for all others
+    // we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others
     // we have to fall back to C++
     if((input_layout == DataLayout::NCHW && _perm == PermutationVector{ 2U, 0U, 1U }) || (input_layout == DataLayout::NHWC && _perm == PermutationVector{ 1U, 2U, 0U }))
     {
diff --git a/src/core/NEON/kernels/NEPermuteKernel.h b/src/core/NEON/kernels/NEPermuteKernel.h
new file mode 100644
index 0000000..80187de
--- /dev/null
+++ b/src/core/NEON/kernels/NEPermuteKernel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEPERMUTEKERNEL_H
+#define ARM_COMPUTE_NEPERMUTEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** NEON kernel to perform tensor permutation.
+ *
+ * Permutes given a permutation vector
+ */
+class NEPermuteKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEPermuteKernel";
+    }
+    /** Default constructor */
+    NEPermuteKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPermuteKernel(const NEPermuteKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPermuteKernel &operator=(const NEPermuteKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEPermuteKernel(NEPermuteKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEPermuteKernel &operator=(NEPermuteKernel &&) = default;
+    /** Default destructor */
+    ~NEPermuteKernel() = default;
+
+    /** Set the input and output of the kernel.
+     *
+     * @note Arbitrary permutation vectors are supported with rank not greater than 4
+     *
+     * @param[in]  input  The input tensor to permute. Data types supported: All
+     * @param[out] output The output tensor. Data types supported: Same as @p input
+     * @param[in]  perm   Permutation vector
+     */
+    void configure(const ITensor *input, ITensor *output, const PermutationVector &perm);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEPermuteKernel
+     *
+     * @note Arbitrary permutation vectors are supported with rank not greater than 4
+     *
+     * @param[in] input  The input tensor to permute. Data types supported: All
+     * @param[in] output The output tensor. Data types supported: Same as @p input
+     * @param[in] perm   Permutation vector
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Template function to run the permute
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run_permute(const Window &window);
+
+    /** Common signature for all the specialised permute functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using PermuteFunctionPtr = void (NEPermuteKernel::*)(const Window &window);
+
+    PermuteFunctionPtr _func;
+    const ITensor     *_input;
+    ITensor           *_output;
+    PermutationVector  _perm;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEPERMUTEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index 0847cb1..8d17651 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 
 #include "arm_compute/core/TensorInfo.h"
 #include "src/core/CPP/Validate.h"
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
new file mode 100644
index 0000000..d414168
--- /dev/null
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H
+#define ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform pixel-wise multiplication between two tensors */
+class NEPixelWiseMultiplicationKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEPixelWiseMultiplicationKernel";
+    }
+    /** Default constructor */
+    NEPixelWiseMultiplicationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPixelWiseMultiplicationKernel(const NEPixelWiseMultiplicationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPixelWiseMultiplicationKernel &operator=(const NEPixelWiseMultiplicationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEPixelWiseMultiplicationKernel(NEPixelWiseMultiplicationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEPixelWiseMultiplicationKernel &operator=(NEPixelWiseMultiplicationKernel &&) = default;
+    /** Default destructor */
+    ~NEPixelWiseMultiplicationKernel() = default;
+    /** Initialise the kernel's inputs, output, scale and overflow/rounding policies.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *                                                       Support: Broadcast? Scale=1/255?
+     *   - (U8,U8)                         -> U8, S16                 N          Y
+     *   - (U8,S16)                        -> S16                     N          Y
+     *   - (S16,U8)                        -> S16                     N          Y
+     *   - (S16,S16)                       -> S16                     N          Y
+     *   - (S32,S32)                       -> S32                     Y          N
+     *   - (F16,F16)                       -> F16                     N          Y
+     *   - (F32,F32)                       -> F32                     Y          Y
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8                 Y          Y
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED          Y          Y
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16, S32            N          Y
+     *
+     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+     *
+     * @param[in]  input1          First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+     * @param[in]  input2          Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+     * @param[out] output          Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+     * @param[in]  scale           Scale to apply after multiplication.
+     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     *                             If @p input1, @p input2 and @p output are all of datatype S32, scale cannot be 1/255
+     * @param[in]  overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
+     * @param[in]  rounding_policy Rounding policy.
+     */
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *                                                       Support: Broadcast? Scale=1/255?
+     *   - (U8,U8)                         -> U8, S16                 N          Y
+     *   - (U8,S16)                        -> S16                     N          Y
+     *   - (S16,U8)                        -> S16                     N          Y
+     *   - (S16,S16)                       -> S16                     N          Y
+     *   - (S32,S32)                       -> S32                     Y          N
+     *   - (F16,F16)                       -> F16                     N          Y
+     *   - (F32,F32)                       -> F32                     Y          Y
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8                 Y          Y
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED          Y          Y
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16, S32            N          Y
+     *
+     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+     *
+     * @param[in] input1          First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+     * @param[in] input2          Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+     * @param[in] output          Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+     * @param[in] scale           Scale to apply after multiplication.
+     *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     *                            If @p input1, @p input2 and @p output are all of datatype S32, scale cannot be 1/255
+     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
+     * @param[in] rounding_policy Rounding policy.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+
+    // Inherited methods overridden
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the specialised multiplication functions with integer scaling factor
+     *
+     * @param[in]  in1    Input1 tensor object.
+     * @param[in]  in2    Input2 tensor object.
+     * @param[out] out    Output tensor object.
+     * @param[in]  window Region on which to execute the kernel
+     * @param[in]  scale  Integer scale factor.
+     */
+    using MulFunctionInt = void(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int scale);
+    /** Common signature for all the specialised multiplication functions with float scaling factor
+     *
+     * @param[in]  in1    Input1 tensor object.
+     * @param[in]  in2    Input2 tensor object.
+     * @param[out] out    Output tensor object.
+     * @param[in]  window Region on which to execute the kernel
+     * @param[in]  scale  Float scale factor.
+     */
+    using MulFunctionFloat = void(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale);
+    /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
+     *
+     * @param[in]  in1    Input1 tensor object.
+     * @param[in]  in2    Input2 tensor object.
+     * @param[out] out    Output tensor object.
+     * @param[in]  window Region on which to execute the kernel
+     * @param[in]  scale  Float scale factor.
+     *
+     */
+    using MulFunctionQuantized = void(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale);
+
+    MulFunctionFloat     *_func_float;
+    MulFunctionInt       *_func_int;
+    MulFunctionQuantized *_func_quantized;
+
+private:
+    float _scale;
+    int   _scale_exponent;
+};
+
+/** Interface for the complex pixelwise multiplication kernel. */
+class NEComplexPixelWiseMultiplicationKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEComplexPixelWiseMultiplicationKernel";
+    }
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+     * @param[in]  input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[out] output The output tensor, Data types supported: same as @p input1.  Number of channels supported: same as @p input1.
+     */
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplicationKernel
+     *
+     * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+     * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+};
+
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index f9636dc..0f0b9ee 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.h b/src/core/NEON/kernels/NEPoolingLayerKernel.h
new file mode 100644
index 0000000..aa3d2f3
--- /dev/null
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H
+#define ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the pooling layer kernel */
+class NEPoolingLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEPoolingLayerKernel";
+    }
+    /** Default constructor */
+    NEPoolingLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPoolingLayerKernel(const NEPoolingLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPoolingLayerKernel &operator=(const NEPoolingLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEPoolingLayerKernel(NEPoolingLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEPoolingLayerKernel &operator=(NEPoolingLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEPoolingLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @note F16 are supported for pool sizes 2 and 3 only
+     *
+     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out] indices   (optional) The indices of the maximal values. Data type supported: U32.
+     */
+    void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel
+     *
+     * @note F16 are supported for pool sizes 2 and 3 only
+     *
+     * @param[in] input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Function to perform 2x2 pooling.
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    void pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window);
+    /** Function to perform MxN pooling for 32-bit floating point values.
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform MxN pooling for 32-bit floating point values (NHWC).
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 7x7 pooling.
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 3x3 pooling.
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 2x2 pooling for float16_t.
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 2x2 pooling and compute the pooling indices for FP32/FP16. The indices can be used for max unpool.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <typename T>
+    void pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window);
+    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    void pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window);
+    /** Function to perform 3x3 pooling.
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform MxN pooling for 16-bit floating point values.
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform MxN pooling for 16-bit floating point values. (NHWC)
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Template function to perform 2x2 pooling for 8bit quantized fixed point. (NCHW)
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    template <typename T>
+    void pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Template function to perform 3x3 pooling for 8bit quantized fixed point. (NCHW)
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    template <typename T>
+    void pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Template function to perform MxN pooling for 8-bit quantized. (NCHW)
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    template <typename T>
+    void poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Template function to perform MxN pooling for 8-bit quantized. (NHWC)
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    template <typename T>
+    void poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Common signature for all the specialised Pooling functions
+     *
+     * @param[in] window_input    Input region on which to execute the kernel.
+     * @param[in] window          Output region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    using PoolingFunction = void (NEPoolingLayerKernel::*)(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding);
+
+private:
+    PoolingFunction  _func;
+    const ITensor   *_input;
+    ITensor         *_output;
+    ITensor         *_indices;
+    PoolingLayerInfo _pool_info;
+    DataLayout       _data_layout;
+    unsigned int     _num_elems_processed_per_iteration;
+    BorderSize       _border_size;
+    bool             _is_square;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index 06a1f14..6757aff 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h"
+#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
new file mode 100644
index 0000000..430a47f
--- /dev/null
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEPRIORBOXLAYERKERNEL_H
+#define ARM_COMPUTE_NEPRIORBOXLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to calculate prior boxes */
+class NEPriorBoxLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEPriorBoxLayerKernel";
+    }
+    /** Default constructor */
+    NEPriorBoxLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPriorBoxLayerKernel(const NEPriorBoxLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPriorBoxLayerKernel &operator=(const NEPriorBoxLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEPriorBoxLayerKernel(NEPriorBoxLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEPriorBoxLayerKernel &operator=(NEPriorBoxLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEPriorBoxLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
+     * @param[in]  input2 Second source tensor. Data types and layouts supported: same as @p input1
+     * @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input1
+     * @param[in]  info   Prior box layer info.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEPriorBoxLayerKernel
+     *
+     * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
+     * @param[in] input2 Second source tensor info. Data types and layouts supported: same as @p input1
+     * @param[in] output Destination tensor info. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input1
+     * @param[in] info   Prior box layer info.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Stores the coordinates of the calculated prior boxes.
+     *
+     * @param[out] out        Output pointer.
+     * @param[in]  offset     Output offset to write to.
+     * @param[in]  center_x   Center pixel value on x-axis.
+     * @param[in]  center_y   Center pixel value on y-axis.
+     * @param[in]  box_width  Prior box width.
+     * @param[in]  box_height Prior box height.
+     * @param[in]  width      Input width.
+     * @param[in]  height     Input height.
+     */
+    void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height);
+    /** Function to calculate prior boxes.
+     *
+     * @param[in] window Input region on which to execute the kernel.
+     */
+    void calculate_prior_boxes(const Window &window);
+
+    const ITensor    *_input1; /**< First input tensor */
+    const ITensor    *_input2; /**< Second input tensor */
+    ITensor          *_output; /**< Output tensor */
+    PriorBoxLayerInfo _info;   /**< Prior box layer info */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEPRIORBOXLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
index 55585b4..8c1c8cf 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
+#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
new file mode 100644
index 0000000..ba68171
--- /dev/null
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H
+#define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+#include <functional>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform layer normalization */
+class NEQLSTMLayerNormalizationKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEQLSTMLayerNormalizationKernel";
+    }
+    /** Default constructor */
+    NEQLSTMLayerNormalizationKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEQLSTMLayerNormalizationKernel(const NEQLSTMLayerNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEQLSTMLayerNormalizationKernel &operator=(const NEQLSTMLayerNormalizationKernel &) = delete;
+    /** Default Move Constructor. */
+    NEQLSTMLayerNormalizationKernel(NEQLSTMLayerNormalizationKernel &&) = default;
+    /** Default move assignment operator */
+    NEQLSTMLayerNormalizationKernel &operator=(NEQLSTMLayerNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~NEQLSTMLayerNormalizationKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QSYMM16.
+     * @param[out] output Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  weight Weight tensor. Data types supported: Same as @p input.
+     * @param[in]  bias   Bias tensor. Data types supported: S32
+     */
+    void configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayerNormalizationKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: QSYMM16.
+     * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+     * @param[in] weight Weight tensor info. Data types supported: Same as @p input.
+     * @param[in] bias   Bias tensor info. Data types supported: S32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    // constants
+    static constexpr uint32_t max_input_dimension{ 2 };  /**< The maximum input dimension supported */
+    static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported */
+    static constexpr uint32_t max_bias_dimension{ 1 };   /**< The maximum bias dimension supported */
+    static constexpr uint32_t vector_size_byte{ 16 };    /**< Computation vector size in byte */
+
+    using ComputeFuncType = std::function<void(NEQLSTMLayerNormalizationKernel &)>;
+
+    ComputeFuncType _fn{}; /**< Callable (std::function) wrapping the computation function; empty by default */
+
+    const ITensor *_input
+    {
+        nullptr
+    }; /**< Input tensor */
+    const ITensor *_weight
+    {
+        nullptr
+    }; /**< Weight tensor */
+    const ITensor *_bias
+    {
+        nullptr
+    };                           /**< Bias tensor */
+    ITensor *_output{ nullptr }; /**< Output tensor */
+
+    int32_t _output_multiplier{}; /**< Multiplier for output values */
+    int32_t _output_shift{};      /**< Shift value for output values */
+
+    int32_t _window_start_x{}; /**< The beginning of x-axis iteration */
+    int32_t _window_end_x{};   /**< The end of x-axis iteration */
+    int32_t _window_step_x{};  /**< The size of x-axis iteration's step */
+
+    Window _inout_window{};  /**< Window for input and output tensor */
+    Window _weight_window{}; /**< Window for weight and bias tensor */
+
+    /** Function to configure initial windows for destination of computation
+     *
+     * @param[in] target Destination tensor to use for output window
+     *
+     * @return configured window
+     */
+    Window configure_window(ITensor *target);
+    /** Function to compute for data type QSYMM16 */
+    void compute_qsymm16();
+    /** Function to compute summation and summation of squared input of the given input pointer
+     *
+     * @param[in] input_ptr Pointer to input array
+     * @return Pair of the two sums (presumably { sum, sum of squares } in that order - confirm against the definition)
+     */
+    std::pair<int64_t, int64_t> sum_qsymm16(const int16_t *input_ptr);
+    /** Function to normalize values using computed mean and standard deviation
+     *
+     * @param[in]  input_ptr     Pointer to input array
+     * @param[out] output_ptr    Pointer to output array
+     * @param[in]  weight_ptr    Pointer to weight array
+     * @param[in]  bias_ptr      Pointer to bias array
+     * @param[in]  mean          Mean value
+     * @param[in]  inv_std_mul   Quantized multiplier for standard deviation
+     * @param[in]  inv_std_shift Shift for standard deviation
+     * NOTE(review): the name says "qasymm16" while configure() documents QSYMM16 data - confirm the intended naming.
+     */
+    void normalize_qasymm16(const int16_t *input_ptr,
+                            int16_t       *output_ptr,
+                            const int16_t *weight_ptr,
+                            const int32_t *bias_ptr,
+                            int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift);
+    /** Function to compute output quantization information */
+    QuantizationInfo compute_output_qinfo();
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index 990e4b6..ff3d9ff 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
+#include "src/core/NEON/kernels/NEQuantizationLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.h b/src/core/NEON/kernels/NEQuantizationLayerKernel.h
new file mode 100644
index 0000000..5ee0ed4
--- /dev/null
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the quantization layer kernel.
+ *
+ * @note The implementation supports only 3D input tensors
+ *
+ */
+class NEQuantizationLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEQuantizationLayerKernel";
+    }
+    /** Default constructor */
+    NEQuantizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEQuantizationLayerKernel(const NEQuantizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEQuantizationLayerKernel &operator=(const NEQuantizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    NEQuantizationLayerKernel(NEQuantizationLayerKernel &&) = default;
+    /** Default move assignment operator */
+    NEQuantizationLayerKernel &operator=(NEQuantizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEQuantizationLayerKernel() = default;
+    /** Set the input, output.
+     *
+     * @param[in]  input  Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this kernel
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEQuantizationLayerKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[in] output Output tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the specialised @ref NEQuantizationLayerKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using QuantizationFunctionExecutorPtr = void (NEQuantizationLayerKernel::*)(const Window &window);
+    /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename TIn, typename TOut>
+    void run_quantize_qasymm8(const Window &window);
+    /** Function to apply QASYMM16 quantization on a tensor.
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename T>
+    void run_quantize_qasymm16(const Window &window);
+
+    const ITensor *_input;  /**< Input tensor */
+    ITensor       *_output; /**< Output tensor */
+
+    QuantizationFunctionExecutorPtr _func; /**< Pointer to a member function with the common quantization signature */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
index 79f7888..c48cda8 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h"
+#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
new file mode 100644
index 0000000..d909fb1
--- /dev/null
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEROIALIGNLAYERKERNEL_H
+#define ARM_COMPUTE_NEROIALIGNLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the RoIAlign kernel.
+ */
+class NEROIAlignLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEROIAlignLayerKernel";
+    }
+
+    /** Constructor */
+    NEROIAlignLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEROIAlignLayerKernel(const NEROIAlignLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEROIAlignLayerKernel &operator=(const NEROIAlignLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    NEROIAlignLayerKernel(NEROIAlignLayerKernel &&) = default;
+    /** Default move assignment operator. */
+    NEROIAlignLayerKernel &operator=(NEROIAlignLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEROIAlignLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. Data types supported: QASYMM8/F16/F32.
+     * @param[in]  rois      ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
+     *                       as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ].
+     *                       Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8, otherwise same as @p input
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of ROIs (N) in the @p rois tensor.
+     */
+    void configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEROIAlignLayerKernel
+     *
+     * @param[in] input     Source tensor info. Data types supported: QASYMM8/F16/F32.
+     * @param[in] rois      ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8,
+     *                      otherwise same as @p input
+     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
+     * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of ROIs (N) in the @p rois tensor.
+     *
+     * @return a Status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    template <DataLayout data_layout, typename input_data_type, typename roi_data_type = input_data_type>
+    void internal_run(const Window &window, const ThreadInfo &info); /**< Templated on data layout, input data type and ROI data type (the latter defaults to the input data type) */
+
+    const ITensor      *_input;     /**< Input tensor */
+    ITensor            *_output;    /**< Output tensor */
+    const ITensor      *_rois;      /**< ROIs tensor */
+    ROIPoolingLayerInfo _pool_info; /**< ROI pooling layer info */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEROIALIGNLAYERKERNEL_H*/
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index a3171d9..40dae82 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
new file mode 100644
index 0000000..3642417
--- /dev/null
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEROIPOOLINGLAYERKERNEL_H
+#define ARM_COMPUTE_NEROIPOOLINGLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+#include "arm_compute/core/IArray.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the ROI pooling layer kernel */
+class NEROIPoolingLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEROIPoolingLayerKernel";
+    }
+    /** Default constructor */
+    NEROIPoolingLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEROIPoolingLayerKernel(const NEROIPoolingLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEROIPoolingLayerKernel &operator=(const NEROIPoolingLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEROIPoolingLayerKernel(NEROIPoolingLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEROIPoolingLayerKernel &operator=(NEROIPoolingLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEROIPoolingLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. Data types supported: F32.
+     * @param[in]  rois      ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
+     *                       as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as that specified by @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of ROIs (N) in the @p rois tensor.
+     */
+    void configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor      *_input;     /**< Input tensor */
+    const ITensor      *_rois;      /**< ROIs tensor */
+    ITensor            *_output;    /**< Output tensor */
+    ROIPoolingLayerInfo _pool_info; /**< ROI pooling layer info */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEROIPOOLINGLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index 3466794..8d11122 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NERangeKernel.h"
+#include "src/core/NEON/kernels/NERangeKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h
new file mode 100644
index 0000000..7c42ef1
--- /dev/null
+++ b/src/core/NEON/kernels/NERangeKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NERANGEKERNEL_H
+#define ARM_COMPUTE_NERANGEKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel class for Range
+ *
+ * Range generates a 1-D tensor containing a sequence of numbers that begins at 'start' and extends by increments
+ * of 'step' up to but not including 'end'.
+ */
+class NERangeKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NERangeKernel";
+    }
+    /** Default constructor */
+    NERangeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NERangeKernel(const NERangeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NERangeKernel &operator=(const NERangeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NERangeKernel(NERangeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NERangeKernel &operator=(NERangeKernel &&) = default;
+    /** Default destructor */
+    ~NERangeKernel() = default;
+    /** Initialize the kernel's output tensor, start, end and step of the sequence.
+     *
+     * @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  start  The starting value of the sequence.
+     * @param[in]  end    The end value of the sequence (exclusive).
+     * @param[in]  step   The gap between each pair of values in the sequence.
+     */
+    void configure(ITensor *output, float start, float end, float step);
+    /** Static function to check if given info will lead to a valid configuration of @ref NERangeKernel
+     *
+     * @param[in] output Output tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[in] start  The starting value of the sequence.
+     * @param[in] end    The end value of the sequence (exclusive).
+     * @param[in] step   The gap between each pair of values in the sequence.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *output, float start, float end, float step);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    using RangeFunction = void(ITensor *output, float start, float step, const Window &window); /**< Function type pointed to by _func */
+
+    RangeFunction *_func;   /**< Range function to be called */
+    float          _start;  /**< Start of sequence */
+    float          _end;    /**< End of sequence (exclusive) */
+    float          _step;   /**< Increment/step value */
+    ITensor       *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NERANGEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 716b092..4e63dd9 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -21,18 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h
new file mode 100644
index 0000000..dfc105a
--- /dev/null
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H
+#define ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a reduction operation
+ *
+ * @note For ARG_MIN/ARG_MAX reduction, the default data type for an uninitialized
+ *       output tensor is signed 32-bit integer (S32). It is the user's responsibility
+ *       to check that the results do not overflow because the indices are computed
+ *       in unsigned 32-bit (U32).
+ */
+class NEReductionOperationKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEReductionOperationKernel";
+    }
+    /** Default constructor */
+    NEReductionOperationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReductionOperationKernel(const NEReductionOperationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReductionOperationKernel &operator=(const NEReductionOperationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEReductionOperationKernel(NEReductionOperationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEReductionOperationKernel &operator=(NEReductionOperationKernel &&) = default;
+    /** Default destructor */
+    ~NEReductionOperationKernel() = default;
+
+    /** Set the source, destination, reduction axis and reduction operation of the kernel
+     *
+     * @param[in]  input  Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW.
+     * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input, S32 for ARG_MIN/ARG_MAX.
+     *                    Output will have the same number of dimensions as input.
+     * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0
+     * @param[in]  op     Reduction operation to perform.
+     */
+    void configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperationKernel.
+     *
+     * @param[in] input  Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW.
+     * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input, S32 for ARG_MIN/ARG_MAX.
+     *                   Output will have the same number of dimensions as input.
+     * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0
+     * @param[in] op     Reduction operation to perform.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor     *_input;          /**< Input tensor */
+    ITensor           *_output;         /**< Output tensor */
+    unsigned int       _reduction_axis; /**< Reduction axis */
+    ReductionOperation _op;             /**< Reduction operation */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index f698439..b334a11 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
+#include "src/core/NEON/kernels/NERemapKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h
new file mode 100644
index 0000000..8fe1ba5
--- /dev/null
+++ b/src/core/NEON/kernels/NERemapKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEREMAPKERNEL_H
+#define ARM_COMPUTE_NEREMAPKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a remap on a tensor */
+class NERemapKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NERemapKernel";
+    }
+    /** Default constructor */
+    NERemapKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NERemapKernel(const NERemapKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NERemapKernel &operator=(const NERemapKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NERemapKernel(NERemapKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NERemapKernel &operator=(NERemapKernel &&) = default;
+    /** Default destructor */
+    ~NERemapKernel() = default;
+
+    /** Initialize the kernel's input, coordinate maps, output and interpolation policy.
+     *
+     * @param[in]  input  Source tensor. Data type supported: U8.
+     * @param[in]  map_x  Map for X coordinates. Data type supported: F32.
+     * @param[in]  map_y  Map for Y coordinates. Data type supported: F32.
+     * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
+     * @param[in]  policy The interpolation type.
+     */
+    void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Function to perform nearest interpolation on the given window */
+    void remap_nearest(const Window &window);
+    /** Function to perform bilinear interpolation on the given window */
+    void remap_bilinear(const Window &window);
+    /** Remap function to use for the particular interpolation type passed to configure() */
+    void (NERemapKernel::*_func)(const Window &window);
+
+    const ITensor *_input;  /**< Input image */
+    ITensor       *_output; /**< Output image */
+    const ITensor *_map_x;  /**< Input remap x coordinates */
+    const ITensor *_map_y;  /**< Input remap y coordinates */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index 1c48a5c..0dcb439 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h"
+#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.h b/src/core/NEON/kernels/NEReorgLayerKernel.h
new file mode 100644
index 0000000..eac9115
--- /dev/null
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEREORGLAYERKERNEL_H
+#define ARM_COMPUTE_NEREORGLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the kernel to perform tensor re-organization */
+class NEReorgLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEReorgLayerKernel";
+    }
+    /** Default constructor */
+    NEReorgLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReorgLayerKernel(const NEReorgLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReorgLayerKernel &operator=(const NEReorgLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    NEReorgLayerKernel(NEReorgLayerKernel &&) = default;
+    /** Default move assignment operator */
+    NEReorgLayerKernel &operator=(NEReorgLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEReorgLayerKernel() = default;
+    /** Set the input, output and stride of the kernel
+     *
+     * @param[in]  input  Source tensor. Data type supported: All
+     * @param[out] output Destination tensor. Data type supported: Same as @p input
+     * @param[in]  stride Stride to be used during data re-organization.
+     *                    It defines the spatial distance between 2 consecutive pixels in the x and y direction
+     */
+    void configure(const ITensor *input, ITensor *output, int32_t stride);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEReorgLayerKernel
+     *
+     * @param[in] input  Source tensor info. Data type supported: All
+     * @param[in] output Destination tensor info. Data type supported: Same as @p input
+     * @param[in] stride Stride to be used during data re-organization.
+     *                   It defines the spatial distance between 2 consecutive pixels in the x and y direction
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;  /**< Input tensor */
+    ITensor       *_output; /**< Output tensor */
+    int32_t        _stride; /**< Re-organization stride */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEREORGLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index 7946812..462404f 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -21,18 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
+#include "src/core/NEON/kernels/NEReshapeLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.h b/src/core/NEON/kernels/NEReshapeLayerKernel.h
new file mode 100644
index 0000000..ecec8d9
--- /dev/null
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NERESHAPELAYERKERNEL_H
+#define ARM_COMPUTE_NERESHAPELAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the kernel to perform tensor reshaping. configure() takes tensor infos only; the tensors are passed to run_op() through an ITensorPack. */
+class NEReshapeLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEReshapeLayerKernel";
+    }
+    /** Default constructor */
+    NEReshapeLayerKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReshapeLayerKernel(const NEReshapeLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReshapeLayerKernel &operator=(const NEReshapeLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEReshapeLayerKernel(NEReshapeLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEReshapeLayerKernel &operator=(NEReshapeLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEReshapeLayerKernel() = default;
+    /** Set the input and output tensor info of the kernel
+     *
+     * @param[in]  input  Source tensor info. Data type supported: All
+     * @param[out] output Destination tensor info. Data type supported: Same as @p input
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEReshapeLayerKernel
+     *
+     * @param[in] input  Source tensor info. Data type supported: All
+     * @param[in] output Destination tensor info. Data type supported: Same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NERESHAPELAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index 2c081cb..21c7580 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
+#include "src/core/NEON/kernels/NEReverseKernel.h"
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
diff --git a/src/core/NEON/kernels/NEReverseKernel.h b/src/core/NEON/kernels/NEReverseKernel.h
new file mode 100644
index 0000000..07b547a
--- /dev/null
+++ b/src/core/NEON/kernels/NEReverseKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEREVERSEKERNEL_H
+#define ARM_COMPUTE_NEREVERSEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the reverse layer kernel. */
+class NEReverseKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEReverseKernel";
+    }
+    /** Default constructor */
+    NEReverseKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReverseKernel(const NEReverseKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReverseKernel &operator=(const NEReverseKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEReverseKernel(NEReverseKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEReverseKernel &operator=(NEReverseKernel &&) = default;
+    /** Default destructor */
+    ~NEReverseKernel() = default;
+    /** Initialise the kernel's input, output and axis tensors
+     *
+     * @param[in]  input  Input tensor. Data types supported: All
+     * @param[out] output Output tensor. Data type supported: Same as @p input
+     * @param[in]  axis   Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+     */
+    void configure(const ITensor *input, ITensor *output, const ITensor *axis);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEReverseKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: All
+     * @param[in] output Output tensor info. Data type supported: Same as @p input
+     * @param[in] axis   Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;  /**< Input tensor */
+    ITensor       *_output; /**< Output tensor */
+    const ITensor *_axis;   /**< Axis tensor */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEREVERSEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 2e40759..5a6d49b 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+#include "src/core/NEON/kernels/NEScaleKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
diff --git a/src/core/NEON/kernels/NEScaleKernel.h b/src/core/NEON/kernels/NEScaleKernel.h
new file mode 100644
index 0000000..a3786db
--- /dev/null
+++ b/src/core/NEON/kernels/NEScaleKernel.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESCALEKERNEL_H
+#define ARM_COMPUTE_NESCALEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform scaling on a tensor */
+class NEScaleKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEScaleKernel";
+    }
+    /** Default constructor */
+    NEScaleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEScaleKernel(const NEScaleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEScaleKernel &operator=(const NEScaleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEScaleKernel(NEScaleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEScaleKernel &operator=(NEScaleKernel &&) = default;
+    /** Default destructor */
+    ~NEScaleKernel() = default;
+
+    /** Initialise the kernel's inputs, output and interpolation policy
+     *
+     * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
+     * @note Using the Area interpolation policy only supports data layout NCHW and input data type U8.
+     *
+     * @param[in]  input   Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
+     * @param[in]  dx      Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
+     * @param[in]  dy      Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
+     * @param[in]  offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
+     * @param[out] output  Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  info    @ref ScaleKernelInfo to use for configuration
+     */
+    void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output,
+                   const ScaleKernelInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEScaleKernel
+     *
+     * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
+     * @note Using the Area interpolation policy only supports data layout NCHW and input data type U8.
+     * @param[in] input   Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
+     * @param[in] dx      Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
+     * @param[in] dy      Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
+     * @param[in] offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
+     * @param[in] output  Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in] info    @ref ScaleKernelInfo to use for validation
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *output,
+                           const ScaleKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Function to perform scale using area interpolation on the given window
+     *
+     *  @note Used only in case of down-sampling.
+     */
+    void scale_area_nchw_u8(const Window &window);
+
+    /** Function to perform scale using bilinear interpolation on the given window */
+    template <typename T>
+    void scale_bilinear_nchw(const Window &window);
+    /** Function to perform scale using bilinear interpolation on the given window */
+    template <typename T>
+    void scale_bilinear_nhwc(const Window &window);
+    /** Function to perform scale using bilinear interpolation on the given window */
+    template <typename T>
+    void scale_bilinear_qasymm(const Window &window);
+
+    /** Function to perform scale using nearest neighbour on the given window */
+    template <typename T>
+    void scale_nearest_nchw(const Window &window);
+    /** Function to perform scale using nearest neighbour on the given window */
+    template <typename T>
+    void scale_nearest_nhwc(const Window &window);
+
+    /** Pointer-to-member type shared by the scale functions declared above */
+    using ScaleFunctionPtr = void (NEScaleKernel::*)(const Window &window);
+
+    ScaleFunctionPtr    _func;                  /**< Scale function pointer */
+    const ITensor      *_offsets;               /**< Offsets tensor */
+    const ITensor      *_dx;                    /**< dx tensor */
+    const ITensor      *_dy;                    /**< dy tensor */
+    const ITensor      *_input;                 /**< Input tensor */
+    ITensor            *_output;                /**< Output tensor */
+    InterpolationPolicy _policy;                /**< Interpolation policy */
+    BorderMode          _border_mode;           /**< Border mode */
+    PixelValue          _constant_border_value; /**< Constant border value */
+    float               _sampling_offset;       /**< Sampling offset */
+    bool                _align_corners;         /**< Align-corners flag */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NESCALEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
index eb1dc65..58b8caa 100644
--- a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
+#include "src/core/NEON/kernels/NEScharr3x3Kernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.h b/src/core/NEON/kernels/NEScharr3x3Kernel.h
new file mode 100644
index 0000000..920410e
--- /dev/null
+++ b/src/core/NEON/kernels/NEScharr3x3Kernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESCHARR3x3KERNEL_H
+#define ARM_COMPUTE_NESCHARR3x3KERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run a 3x3 Scharr filter (X and/or Y gradient) on a tensor.
+ *
+ * @f[
+ *      \mathbf{G}_x=\begin{vmatrix}
+ *      -3 & 0 & +3\\
+ *      -10& 0 & +10\\
+ *      -3 & 0 & +3
+ *      \end{vmatrix}
+ * @f]
+ */
+class NEScharr3x3Kernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEScharr3x3Kernel";
+    }
+    /** Default constructor */
+    NEScharr3x3Kernel();
+    /** Copy construction is disabled because this class holds raw tensor pointers */
+    NEScharr3x3Kernel(const NEScharr3x3Kernel &) = delete;
+    /** Copy assignment is disabled because this class holds raw tensor pointers */
+    NEScharr3x3Kernel &operator=(const NEScharr3x3Kernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NEScharr3x3Kernel(NEScharr3x3Kernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NEScharr3x3Kernel &operator=(NEScharr3x3Kernel &&) = default;
+    /** Default destructor */
+    ~NEScharr3x3Kernel() = default;
+
+    /** Initialise the kernel's source, destinations and border handling.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Methods overridden from the base kernel interface:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    bool           _run_scharr_x; /**< Do we need to run Scharr X ? */
+    bool           _run_scharr_y; /**< Do we need to run Scharr Y ? */
+    const ITensor *_input;        /**< Input tensor */
+    ITensor       *_output_x;     /**< Output tensor for Scharr X */
+    ITensor       *_output_y;     /**< Output tensor for Scharr Y */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NESCHARR3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index 2f36db2..9cf9b98 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NESelectKernel.h"
+#include "src/core/NEON/kernels/NESelectKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h
new file mode 100644
index 0000000..f7142fe
--- /dev/null
+++ b/src/core/NEON/kernels/NESelectKernel.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESELECTKERNEL_H
+#define ARM_COMPUTE_NESELECTKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the select kernel
+ *
+ * Select is computed by:
+ * @f[ output(i) = condition(i) ? x(i) : y(i) @f]
+ *
+ */
+class NESelectKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NESelectKernel";
+    }
+    /** Default constructor */
+    NESelectKernel();
+    /** Copy construction is disabled because this class holds raw tensor pointers */
+    NESelectKernel(const NESelectKernel &) = delete;
+    /** Copy assignment is disabled because this class holds raw tensor pointers */
+    NESelectKernel &operator=(const NESelectKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NESelectKernel(NESelectKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NESelectKernel &operator=(NESelectKernel &&) = default;
+    /** Default destructor */
+    ~NESelectKernel() = default;
+
+    /** Initialise the kernel's condition, input and output tensors.
+     *
+     * @param[in]  c      Condition input tensor. Data types supported: U8.
+     * @param[in]  x      First input tensor. Data types supported: All.
+     * @param[in]  y      Second input tensor. Data types supported: Same as @p x
+     * @param[out] output Output tensor. Data types supported: Same as @p x
+     */
+    void configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output);
+
+    /** Validate the arguments passed to the kernel
+     *
+     * @param[in] c      Condition input tensor info. Data types supported: U8.
+     * @param[in] x      First input tensor info. Data types supported: All.
+     * @param[in] y      Second input tensor info. Data types supported: Same as @p x
+     * @param[in] output Output tensor info. Data types supported: Same as @p x.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output);
+
+    // Methods overridden from the base kernel interface:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the specialised select functions (the tensors below followed by a Window)
+     *
+     * @param[in]  c      Condition input tensor. Data types supported: U8.
+     * @param[in]  x      First input tensor. Data types supported: All.
+     * @param[in]  y      Second input tensor. Data types supported: Same as @p x
+     * @param[out] output Output tensor. Data types supported: Same as @p x.
+     */
+    using SelectFunction = void(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window);
+
+    /** Select function to use for the particular tensor types passed to configure() */
+    SelectFunction *_function;
+    const ITensor *_c;              /**< Condition tensor */
+    const ITensor *_x;              /**< Source tensor 1 */
+    const ITensor *_y;              /**< Source tensor 2 */
+    ITensor        *_output;        /**< Destination tensor */
+    bool            _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NESELECTKERNEL_H */
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
index 1c7089b..ecf6b59 100644
--- a/src/core/NEON/kernels/NESobel3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
+#include "src/core/NEON/kernels/NESobel3x3Kernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.h b/src/core/NEON/kernels/NESobel3x3Kernel.h
new file mode 100644
index 0000000..2c3eaf5
--- /dev/null
+++ b/src/core/NEON/kernels/NESobel3x3Kernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESOBEL3x3KERNEL_H
+#define ARM_COMPUTE_NESOBEL3x3KERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run a 3x3 Sobel filter (X and/or Y gradient) on a tensor.
+ *
+ * @f[
+ *      \mathbf{G}_x=\begin{vmatrix}
+ *      -1 & 0 & +1\\
+ *      -2 & 0 & +2\\
+ *      -1 & 0 & +1
+ *      \end{vmatrix}
+ * @f]
+ */
+class NESobel3x3Kernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NESobel3x3Kernel";
+    }
+    /** Default constructor */
+    NESobel3x3Kernel();
+    /** Copy construction is disabled because this class holds raw tensor pointers */
+    NESobel3x3Kernel(const NESobel3x3Kernel &) = delete;
+    /** Copy assignment is disabled because this class holds raw tensor pointers */
+    NESobel3x3Kernel &operator=(const NESobel3x3Kernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NESobel3x3Kernel(NESobel3x3Kernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NESobel3x3Kernel &operator=(NESobel3x3Kernel &&) = default;
+    /** Default destructor */
+    ~NESobel3x3Kernel() = default;
+
+    /** Initialise the kernel's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Methods overridden from the base kernel interface:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    bool           _run_sobel_x; /**< Do we need to run Sobel X ? */
+    bool           _run_sobel_y; /**< Do we need to run Sobel Y ? */
+    const ITensor *_input;       /**< Input tensor */
+    ITensor       *_output_x;    /**< Output tensor for Sobel X */
+    ITensor       *_output_y;    /**< Output tensor for Sobel Y */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NESOBEL3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
index 2421ea7..5a66b1f 100644
--- a/src/core/NEON/kernels/NESobel5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
+#include "src/core/NEON/kernels/NESobel5x5Kernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.h b/src/core/NEON/kernels/NESobel5x5Kernel.h
new file mode 100644
index 0000000..bd5eb29
--- /dev/null
+++ b/src/core/NEON/kernels/NESobel5x5Kernel.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESOBEL5x5KERNEL_H
+#define ARM_COMPUTE_NESOBEL5x5KERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the horizontal pass of the 5x5 Sobel filter on a tensor.
+ *
+ */
+class NESobel5x5HorKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NESobel5x5HorKernel";
+    }
+    /** Default constructor */
+    NESobel5x5HorKernel();
+    /** Copy construction is disabled because this class holds raw tensor pointers */
+    NESobel5x5HorKernel(const NESobel5x5HorKernel &) = delete;
+    /** Copy assignment is disabled because this class holds raw tensor pointers */
+    NESobel5x5HorKernel &operator=(const NESobel5x5HorKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NESobel5x5HorKernel(NESobel5x5HorKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NESobel5x5HorKernel &operator=(NESobel5x5HorKernel &&) = default;
+    /** Default destructor */
+    ~NESobel5x5HorKernel() = default;
+
+    /** Initialise the kernel's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Methods overridden from the base kernel interface:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;       /**< Input tensor */
+    ITensor       *_output_x;    /**< X output of horizontal pass */
+    ITensor       *_output_y;    /**< Y output of horizontal pass */
+    bool           _run_sobel_x; /**< Do we need to run Sobel X? */
+    bool           _run_sobel_y; /**< Do we need to run Sobel Y? */
+    BorderSize     _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of the 5x5 Sobel filter (X and/or Y gradient) on a tensor.
+ *
+ */
+class NESobel5x5VertKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NESobel5x5VertKernel";
+    }
+    /** Default constructor */
+    NESobel5x5VertKernel();
+    /** Copy construction is disabled because this class holds raw tensor pointers */
+    NESobel5x5VertKernel(const NESobel5x5VertKernel &) = delete;
+    /** Copy assignment is disabled because this class holds raw tensor pointers */
+    NESobel5x5VertKernel &operator=(const NESobel5x5VertKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NESobel5x5VertKernel(NESobel5x5VertKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NESobel5x5VertKernel &operator=(NESobel5x5VertKernel &&) = default;
+    /** Default destructor */
+    ~NESobel5x5VertKernel() = default;
+
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @param[in]  input_x          Input for X (X output of hor pass). Data type supported: S16.
+     * @param[in]  input_y          Input for Y (Y output of hor pass). Data type supported: S16.
+     * @param[out] output_x         Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Methods overridden from the base kernel interface:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    ITensor *_input_x;     /**< X input (X output of the hor pass) */
+    ITensor *_input_y;     /**< Y input (Y output of the hor pass) */
+    ITensor *_output_x;    /**< X output of Sobel */
+    ITensor *_output_y;    /**< Y output of Sobel */
+    bool     _run_sobel_x; /**< Do we need to run Sobel X? */
+    bool     _run_sobel_y; /**< Do we need to run Sobel Y? */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NESOBEL5x5KERNEL_H */
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
index 779d67a..835b333 100644
--- a/src/core/NEON/kernels/NESobel7x7Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
+#include "src/core/NEON/kernels/NESobel7x7Kernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.h b/src/core/NEON/kernels/NESobel7x7Kernel.h
new file mode 100644
index 0000000..c5a3899
--- /dev/null
+++ b/src/core/NEON/kernels/NESobel7x7Kernel.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESOBEL7x7KERNEL_H
+#define ARM_COMPUTE_NESOBEL7x7KERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the horizontal pass of the 7x7 Sobel filter on a tensor.
+ *
+ */
+class NESobel7x7HorKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NESobel7x7HorKernel";
+    }
+    /** Default constructor */
+    NESobel7x7HorKernel();
+    /** Copy construction is disabled because this class holds raw tensor pointers */
+    NESobel7x7HorKernel(const NESobel7x7HorKernel &) = delete;
+    /** Copy assignment is disabled because this class holds raw tensor pointers */
+    NESobel7x7HorKernel &operator=(const NESobel7x7HorKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NESobel7x7HorKernel(NESobel7x7HorKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NESobel7x7HorKernel &operator=(NESobel7x7HorKernel &&) = default;
+    /** Default destructor */
+    ~NESobel7x7HorKernel() = default;
+
+    /** Initialise the kernel's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S32.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S32.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Methods overridden from the base kernel interface:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;       /**< Input tensor */
+    ITensor       *_output_x;    /**< X output of horizontal pass */
+    ITensor       *_output_y;    /**< Y output of horizontal pass */
+    bool           _run_sobel_x; /**< Do we need to run Sobel X? */
+    bool           _run_sobel_y; /**< Do we need to run Sobel Y? */
+    BorderSize     _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of the 7x7 Sobel filter (X and/or Y gradient) on a tensor.
+ *
+ */
+class NESobel7x7VertKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NESobel7x7VertKernel";
+    }
+    /** Default constructor */
+    NESobel7x7VertKernel();
+    /** Copy construction is disabled because this class holds raw tensor pointers */
+    NESobel7x7VertKernel(const NESobel7x7VertKernel &) = delete;
+    /** Copy assignment is disabled because this class holds raw tensor pointers */
+    NESobel7x7VertKernel &operator=(const NESobel7x7VertKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NESobel7x7VertKernel(NESobel7x7VertKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NESobel7x7VertKernel &operator=(NESobel7x7VertKernel &&) = default;
+    /** Default destructor */
+    ~NESobel7x7VertKernel() = default;
+
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must be set.
+     * @note If output_x is set then input_x must be set too.
+     * @note If output_y is set then input_y must be set too.
+     *
+     * @param[in]  input_x          (Optional) Input for X (X output of hor pass). Data type supported: S32.
+     * @param[in]  input_y          (Optional) Input for Y (Y output of hor pass). Data type supported: S32.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S32.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S32.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Methods overridden from the base kernel interface:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input_x;     /**< X input (X output of the hor pass) */
+    const ITensor *_input_y;     /**< Y input (Y output of the hor pass) */
+    ITensor       *_output_x;    /**< X output of Sobel */
+    ITensor       *_output_y;    /**< Y output of Sobel */
+    bool           _run_sobel_x; /**< Do we need to run Sobel X? */
+    bool           _run_sobel_y; /**< Do we need to run Sobel Y? */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NESOBEL7x7KERNEL_H */
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 13f0a54..97797ce 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "src/core/NEON/kernels/NESoftmaxLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.h b/src/core/NEON/kernels/NESoftmaxLayerKernel.h
new file mode 100644
index 0000000..adc2e57
--- /dev/null
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H
+#define ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for identifying the max value of 1D Logits */
+class NELogits1DMaxKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NELogits1DMaxKernel";
+    }
+    /** Default constructor */
+    NELogits1DMaxKernel();
+    /** Copy construction is disabled because this class holds raw pointers */
+    NELogits1DMaxKernel(const NELogits1DMaxKernel &) = delete;
+    /** Copy assignment is disabled because this class holds raw pointers */
+    NELogits1DMaxKernel &operator=(const NELogits1DMaxKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NELogits1DMaxKernel(NELogits1DMaxKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NELogits1DMaxKernel &operator=(NELogits1DMaxKernel &&) = default;
+    /** Default destructor */
+    ~NELogits1DMaxKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] output Destination tensor. Data types supported: same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] output Destination tensor info. Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Methods overridden from the base kernel interface:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    using Logits1DMaxFunction = void(const ITensor &in, ITensor &out, const Window &window);
+
+private:
+    Logits1DMaxFunction *_func;
+    BorderSize           _border_size;
+};
+
+/** Interface for softmax (IS_LOG = false) or log-softmax (IS_LOG = true) computation with pre-computed max. */
+template <bool IS_LOG = false>
+class NELogits1DSoftmaxKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        if(IS_LOG)
+        {
+            return "NELogits1DLogSoftmaxKernel";
+        }
+        else
+        {
+            return "NELogits1DSoftmaxKernel";
+        }
+    }
+    /** Default constructor */
+    NELogits1DSoftmaxKernel();
+    /** Copy construction is disabled because this class holds raw tensor pointers */
+    NELogits1DSoftmaxKernel(const NELogits1DSoftmaxKernel &) = delete;
+    /** Copy assignment is disabled because this class holds raw tensor pointers */
+    NELogits1DSoftmaxKernel &operator=(const NELogits1DSoftmaxKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    NELogits1DSoftmaxKernel(NELogits1DSoftmaxKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    NELogits1DSoftmaxKernel &operator=(NELogits1DSoftmaxKernel &&) = default;
+    /** Default destructor */
+    ~NELogits1DSoftmaxKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  max    Max values tensor. Same shape as input with dimension 0 set to 1.
+     *                    Data types supported: same as @p input.
+     * @param[out] output Destination tensor. Data types supported: same as @p input.
+     * @param[in]  beta   A scaling factor for the exponent.
+     *
+     * @param      tmp    Auxiliary tensor. Must be type F32 and same shape as the input.
+     */
+    void configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp);
+    /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DSoftmaxKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] max    Max values tensor info. Same shape as input with dimension 0 set to 1.
+     *                   Data types supported: same as @p input.
+     * @param[in] output Destination tensor info. Data types supported: same as @p input.
+     * @param[in] beta   A scaling factor for the exponent.
+     * @param[in] tmp    Tensor info of auxiliary. Must be type F32 and same shape as the input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *max,
+                           const ITensorInfo *output, const float beta, const ITensorInfo *tmp);
+
+    // Methods overridden from the base kernel interface:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    using LogitsSoftmaxFunction = void(const ITensor &in, const ITensor &max, void *const tmp, ITensor &out, const float beta,
+                                       const Window &window);
+
+    LogitsSoftmaxFunction *_func;
+    const ITensor         *_input;
+    const ITensor         *_max;
+    ITensor               *_output;
+    float                  _beta;
+    ITensor               *_tmp; // Temporary. Used internally
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
index 3293466..27b3154 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
+#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
new file mode 100644
index 0000000..6277245
--- /dev/null
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H
+#define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+
+/** Interface for the space to batch kernel */
+class NESpaceToBatchLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NESpaceToBatchLayerKernel";
+    }
+    /** Default constructor */
+    NESpaceToBatchLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESpaceToBatchLayerKernel(const NESpaceToBatchLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESpaceToBatchLayerKernel &operator=(const NESpaceToBatchLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESpaceToBatchLayerKernel(NESpaceToBatchLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESpaceToBatchLayerKernel &operator=(NESpaceToBatchLayerKernel &&) = default;
+    /** Default destructor */
+    ~NESpaceToBatchLayerKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
+     * @param[in]  paddings    2-D tensor with shape [2, M]. Data types supported: S32
+     * @param[out] output      Tensor output. Data types supported: same as @p input
+     */
+    void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output);
+    /** Initialise the kernel's input and output. (Static block shape and paddings)
+     *
+     * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape_x Block shape x value.
+     * @param[in]  block_shape_y Block shape y value.
+     * @param[in]  padding_left  The left padding of the output tensor.
+     * @param[in]  padding_right The right padding of the output tensor.
+     * @param[out] output        Tensor output. Data types supported: same as @p input
+     */
+    void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel
+     *
+     * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
+     * @param[in] block_shape 1-D tensor info with shape [M]. Data types supported: S32
+     * @param[in] paddings    2-D tensor info with shape [2, M]. Data types supported: S32
+     * @param[in] output      Tensor output info. Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings)
+     *
+     * @param[in] input         Tensor input info. Supported tensor rank: 4. Data types supported: All.
+     * @param[in] block_shape_x Block shape x value.
+     * @param[in] block_shape_y Block shape y value.
+     * @param[in] padding_left  The left padding of the output tensor.
+     * @param[in] padding_right The right padding of the output tensor.
+     * @param[in] output        Tensor output info. Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;       /**< Source tensor */
+    const ITensor *_block_shape; /**< Block shape tensor */
+    const ITensor *_paddings;    /**< Paddings tensor */
+    ITensor       *_output;      /**< Destination tensor */
+    DataLayout     _data_layout; /**< Data layout to be used at run-time */
+
+    Size2D _padding_left;  /**< Left padding; presumably the value given to the static configure() overload */
+    int    _block_shape_x; /**< Block shape x value; presumably from the static configure() overload */
+    int    _block_shape_y; /**< Block shape y value; presumably from the static configure() overload */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 7c9cc49..7687c50 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
+#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
new file mode 100644
index 0000000..953b68a
--- /dev/null
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H
+#define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the space to depth kernel */
+class NESpaceToDepthLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NESpaceToDepthLayerKernel";
+    }
+    /** Default constructor */
+    NESpaceToDepthLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESpaceToDepthLayerKernel(const NESpaceToDepthLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESpaceToDepthLayerKernel &operator=(const NESpaceToDepthLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESpaceToDepthLayerKernel(NESpaceToDepthLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESpaceToDepthLayerKernel &operator=(NESpaceToDepthLayerKernel &&) = default;
+    /** Default destructor */
+    ~NESpaceToDepthLayerKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[out] output      Tensor output. Data types supported: same as @p input
+     * @param[in]  block_shape Block shape value
+     */
+    void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+    /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToDepthLayerKernel
+     *
+     * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
+     * @param[in] output      Tensor output info. Data types supported: same as @p input
+     * @param[in] block_shape Block shape value
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;       /**< Source tensor */
+    ITensor       *_output;      /**< Destination tensor */
+    int32_t        _block_shape; /**< Block shape */
+    DataLayout     _data_layout; /**< Data layout of the operation */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index ad7f1b1..55170a1 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEStackLayerKernel.h"
+#include "src/core/NEON/kernels/NEStackLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h
new file mode 100644
index 0000000..9b0a039
--- /dev/null
+++ b/src/core/NEON/kernels/NEStackLayerKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_NESTACKLAYERKERNEL_H
+#define ARM_COMPUTE_NESTACKLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension. */
+class NEStackLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEStackLayerKernel";
+    }
+    /** Default constructor */
+    NEStackLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEStackLayerKernel(const NEStackLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEStackLayerKernel &operator=(const NEStackLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEStackLayerKernel(NEStackLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEStackLayerKernel &operator=(NEStackLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEStackLayerKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @note Supported input tensor rank: up to 4
+     *
+     * @param[in]  input       Input tensor. Data types supported: All
+     * @param[in]  axis        The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
+     * @param[in]  idx_input   Index of the input tensor in the list of tensors to stack.
+     *                         All tensors in the list must have the same shape
+     * @param[in]  num_tensors Number of tensors to stack
+     * @param[out] output      Output tensor. Data types supported: Same as @p input.
+     *
+     */
+    void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel
+     *
+     * @note Supported input tensor rank: up to 4
+     *
+     * @param[in] input       Input tensor info. Data types supported: All
+     * @param[in] axis        The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
+     * @param[in] idx_input   Index of the input tensor in the list of tensors to stack.
+     *                        All tensors in the list must have the same shape
+     * @param[in] num_tensors Number of tensors to stack
+     * @param[in] output      Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;     /**< Source tensor */
+    ITensor       *_output;    /**< Destination tensor */
+    unsigned int   _axis;      /**< Stacking axis; presumably the @p axis given to configure() */
+    unsigned int   _idx_input; /**< Presumably the @p idx_input given to configure() */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NESTACKLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
index 13b2cb5..ac04a10 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
+#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
 
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.h b/src/core/NEON/kernels/NEStridedSliceKernel.h
new file mode 100644
index 0000000..9ce5174
--- /dev/null
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H
+#define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the kernel to perform tensor strided slicing */
+class NEStridedSliceKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEStridedSliceKernel";
+    }
+    /** Default constructor */
+    NEStridedSliceKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEStridedSliceKernel(const NEStridedSliceKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEStridedSliceKernel &operator=(const NEStridedSliceKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEStridedSliceKernel(NEStridedSliceKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEStridedSliceKernel &operator=(NEStridedSliceKernel &&) = default;
+    /** Default destructor */
+    ~NEStridedSliceKernel() = default;
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  input            Source tensor info. Data type supported: All
+     * @param[out] output           Destination tensor info. Data type supported: Same as @p input
+     * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  begin_mask       If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in]  end_mask         If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+     *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output,
+                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                   int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in] input            Source tensor info. Data type supported: All
+     * @param[in] output           Destination tensor info. Data type supported: Same as @p input
+     * @param[in] starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] begin_mask       If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] end_mask         If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] shrink_axis_mask If bit i is set, the ith specification shrinks the dimensionality by 1; a slice of size 1 starting from starts[i] in that dimension must be preserved.
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                           int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+private:
+    Coordinates _starts_abs;    /**< Absolute start coordinates */
+    Coordinates _final_strides; /**< Final strides */
+    int32_t     _shrink_mask;   /**< Shrink axis mask */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H */
diff --git a/src/core/NEON/kernels/NETableLookupKernel.cpp b/src/core/NEON/kernels/NETableLookupKernel.cpp
index d26a0ee..19ce7f0 100644
--- a/src/core/NEON/kernels/NETableLookupKernel.cpp
+++ b/src/core/NEON/kernels/NETableLookupKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+#include "src/core/NEON/kernels/NETableLookupKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NETableLookupKernel.h b/src/core/NEON/kernels/NETableLookupKernel.h
new file mode 100644
index 0000000..7937999
--- /dev/null
+++ b/src/core/NEON/kernels/NETableLookupKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NETABLELOOKUPKERNEL_H
+#define ARM_COMPUTE_NETABLELOOKUPKERNEL_H
+
+#include "src/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+class ILut;
+
+/** Interface for the kernel to perform table lookup calculations. */
+class NETableLookupKernel : public INESimpleKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NETableLookupKernel";
+    }
+    /** Default constructor */
+    NETableLookupKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NETableLookupKernel(const NETableLookupKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NETableLookupKernel &operator=(const NETableLookupKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NETableLookupKernel(NETableLookupKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NETableLookupKernel &operator=(NETableLookupKernel &&) = default;
+    /** Default destructor */
+    ~NETableLookupKernel() = default;
+    /** Initialise the kernel's input, lut and output.
+     *
+     * @param[in]  input  An input tensor. Data types supported: U8/S16.
+     * @param[in]  lut    The input LUT.
+     * @param[out] output The output tensor. Data types supported: same as @p input
+     */
+    void configure(const ITensor *input, const ILut *lut, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Perform table lookup on a given window.
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <class T>
+    void tableLookup(const Window &window);
+    /** Common signature for all the specialised lut functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using TableLookupFunction = void (NETableLookupKernel::*)(const Window &window);
+    /** Sub function to use for the particular tensor types passed to configure() */
+    TableLookupFunction _func;
+    const ILut         *_lut; /**< Lookup table; presumably the @p lut given to configure() */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NETABLELOOKUPKERNEL_H */
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp
index aad440b..183bb8d 100644
--- a/src/core/NEON/kernels/NEThresholdKernel.cpp
+++ b/src/core/NEON/kernels/NEThresholdKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+#include "src/core/NEON/kernels/NEThresholdKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEThresholdKernel.h b/src/core/NEON/kernels/NEThresholdKernel.h
new file mode 100644
index 0000000..6b3b386
--- /dev/null
+++ b/src/core/NEON/kernels/NEThresholdKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NETHRESHOLDKERNEL_H
+#define ARM_COMPUTE_NETHRESHOLDKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the thresholding kernel */
+class NEThresholdKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEThresholdKernel";
+    }
+    /** Default constructor */
+    NEThresholdKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEThresholdKernel(const NEThresholdKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEThresholdKernel &operator=(const NEThresholdKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEThresholdKernel(NEThresholdKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEThresholdKernel &operator=(NEThresholdKernel &&) = default;
+    /** Default destructor */
+    ~NEThresholdKernel() = default;
+    /** Initialise the kernel's input, output and threshold parameters.
+     *
+     * @param[in]  input  An input tensor. Data type supported: U8.
+     * @param[out] output The output tensor. Data type supported: U8.
+     * @param[in]  info   Threshold kernel descriptor.
+     */
+    void configure(const ITensor *input, ITensor *output, const ThresholdKernelInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEThresholdKernel
+     *
+     * @param[in] input  Input tensor info. Data type supported: U8.
+     * @param[in] output Output tensor info. Data type supported: U8.
+     * @param[in] info   Threshold kernel descriptor.
+     *
+     * @return A status containing an error code in case of failure
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ThresholdKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Run binary thresholding on the given window */
+    void run_binary(const Window &window);
+    /** Run range thresholding on the given window */
+    void run_range(const Window &window);
+
+    void (NEThresholdKernel::*_func)(const Window &window); /**< Member-function pointer with the signature of run_binary()/run_range(); presumably selected in configure() */
+
+    const ITensor      *_input;  /**< Input */
+    ITensor            *_output; /**< Output */
+    ThresholdKernelInfo _info;   /**< Threshold descriptor */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NETHRESHOLDKERNEL_H */
diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp
index 99651c8..94256dc 100644
--- a/src/core/NEON/kernels/NETileKernel.cpp
+++ b/src/core/NEON/kernels/NETileKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NETileKernel.h"
+#include "src/core/NEON/kernels/NETileKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NETileKernel.h b/src/core/NEON/kernels/NETileKernel.h
new file mode 100644
index 0000000..8dfea8b
--- /dev/null
+++ b/src/core/NEON/kernels/NETileKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NETILEKERNEL_H
+#define ARM_COMPUTE_NETILEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a tile operation: the input tensor is replicated along its dimensions according to a set of multiples */
+class NETileKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NETileKernel();
+    /** Copy construction is disabled because the kernel stores raw tensor pointers. */
+    NETileKernel(const NETileKernel &) = delete;
+    /** Copy assignment is disabled because the kernel stores raw tensor pointers. */
+    NETileKernel &operator=(const NETileKernel &) = delete;
+    /** Move construction is allowed (compiler-generated) */
+    NETileKernel(NETileKernel &&) = default;
+    /** Move assignment is allowed (compiler-generated) */
+    NETileKernel &operator=(NETileKernel &&) = default;
+    /** Default destructor */
+    ~NETileKernel() = default;
+    const char *name() const override
+    {
+        return "NETileKernel";
+    }
+    /** Set the source and destination tensors of the kernel, together with the tiling multiples
+     *
+     * @param[in]  input     Source tensor. Data type supported: All.
+     * @param[out] output    Destination tensor. Data type supported: same as @p input
+     * @param[in]  multiples Number of times the input tensor should be replicated on each given dimension.
+     */
+    void configure(const ITensor *input, ITensor *output, const Multiples &multiples);
+    /** Static function to check if given info will lead to a valid configuration of @ref NETileKernel
+     *
+     * @param[in] input     Source tensor info. Data type supported: All.
+     * @param[in] output    Destination tensor info. Data type supported: same as @p input
+     * @param[in] multiples Number of times the input tensor should be replicated on each given dimension.
+     *
+     * @return A status containing an error code in case of failure
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;  /**< Input tensor, only accessed through a pointer-to-const */
+    ITensor       *_output; /**< Output tensor */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NETILEKERNEL_H */
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 6037810..134831b 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NETransposeKernel.h b/src/core/NEON/kernels/NETransposeKernel.h
new file mode 100644
index 0000000..73d2098
--- /dev/null
+++ b/src/core/NEON/kernels/NETransposeKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NETRANSPOSEKERNEL_H
+#define ARM_COMPUTE_NETRANSPOSEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel which transposes the elements of a matrix.
+ *
+ * [width, height, batch] -> [height, width, batch]
+ *
+ */
+class NETransposeKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NETransposeKernel";
+    }
+    /** Default constructor */
+    NETransposeKernel();
+    /** Copy construction is disabled because the kernel stores raw tensor pointers */
+    NETransposeKernel(const NETransposeKernel &) = delete;
+    /** Copy assignment is disabled because the kernel stores raw tensor pointers */
+    NETransposeKernel &operator=(const NETransposeKernel &) = delete;
+    /** Move construction is allowed (compiler-generated) */
+    NETransposeKernel(NETransposeKernel &&) = default;
+    /** Move assignment is allowed (compiler-generated) */
+    NETransposeKernel &operator=(NETransposeKernel &&) = default;
+    /** Default destructor */
+    ~NETransposeKernel() = default;
+
+    /** Initialise the kernel's input and output tensors.
+     *
+     * @param[in]  input  Input tensor. Data types supported: All
+     * @param[out] output Output tensor. Data type supported: Same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NETransposeKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: All
+     * @param[in] output Output tensor info. Data type supported: Same as @p input
+     *
+     * @return A status containing an error code in case of failure
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature shared by all the transpose functions (a plain function type, not a member function)
+     *
+     * @param[in]  input  An input tensor. Data types supported: All
+     * @param[out] output The output tensor. Data type supported: same as @p input
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using TransposeFunction = void(const ITensor *input, ITensor *output, const Window &window);
+    /** Pointer to the transpose function to use for the particular tensor types passed to configure() */
+    TransposeFunction *_func;
+    const ITensor     *_input;  /**< Input tensor, only accessed through a pointer-to-const */
+    ITensor           *_output; /**< Output tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NETRANSPOSEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
index 129c83c..cbdec50 100644
--- a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
+#include "src/core/NEON/kernels/NEUpsampleLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.h b/src/core/NEON/kernels/NEUpsampleLayerKernel.h
new file mode 100644
index 0000000..7ff797a
--- /dev/null
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H
+#define ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the Upsample layer kernel. */
+class NEUpsampleLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEUpsampleLayerKernel";
+    }
+    /** Default constructor */
+    NEUpsampleLayerKernel();
+    /** Copy construction is disabled because the kernel stores raw tensor pointers */
+    NEUpsampleLayerKernel(const NEUpsampleLayerKernel &) = delete;
+    /** Copy assignment is disabled because the kernel stores raw tensor pointers */
+    NEUpsampleLayerKernel &operator=(const NEUpsampleLayerKernel &) = delete;
+    /** Default move constructor */
+    NEUpsampleLayerKernel(NEUpsampleLayerKernel &&) = default;
+    /** Default move assignment operator */
+    NEUpsampleLayerKernel &operator=(NEUpsampleLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEUpsampleLayerKernel() = default;
+    /** Set the input and output tensors, the stride information and the interpolation policy.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] output Destination tensor. Data types supported: same as @p input.
+     * @param[in]  info   Contains stride information described in @ref Size2D.
+     * @param[in]  policy Defines the policy to fill the intermediate pixels.
+     *
+     */
+    void configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy policy);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEUpsampleLayerKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] output Destination tensor info. Data types supported: same as @p input.
+     * @param[in] info   Contains stride information described in @ref Size2D.
+     * @param[in] policy Defines the policy to fill the intermediate pixels.
+     *
+     * @return A status containing an error code in case of failure
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy policy);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Function to run upsample layer (NCHW). NOTE(review): T and S are presumably the element type and the vector width - confirm in the .cpp
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T, int S>
+    void upsample_nchw(const Window &window);
+    /** Function to run upsample layer (NHWC). NOTE(review): T and S are presumably the element type and the vector width - confirm in the .cpp
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T, int S>
+    void upsample_nhwc(const Window &window);
+
+    using UpsampleFunctionPtr = void (NEUpsampleLayerKernel::*)(const Window &window); /**< Pointer-to-member type matching the upsample_* functions */
+
+private:
+    UpsampleFunctionPtr _func;   /**< Upsample member function to execute */
+    const ITensor      *_input;  /**< Input tensor, only accessed through a pointer-to-const */
+    ITensor            *_output; /**< Output tensor */
+    Size2D              _info;   /**< Size2D descriptor; presumably the stride information given to configure() */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index 891304f..1ae0761 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+#include "src/core/NEON/kernels/NEWarpKernel.h"
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEWarpKernel.h b/src/core/NEON/kernels/NEWarpKernel.h
new file mode 100644
index 0000000..2c4cb55
--- /dev/null
+++ b/src/core/NEON/kernels/NEWarpKernel.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEWARPKERNEL_H
+#define ARM_COMPUTE_NEWARPKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+#include <array>
+#include <cstdint>
+namespace arm_compute
+{
+class ITensor;
+
+/** Common interface for the warp affine and warp perspective kernels */
+class INEWarpKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    INEWarpKernel();
+    /** Copy construction is disabled because the kernel stores raw tensor pointers */
+    INEWarpKernel(const INEWarpKernel &) = delete;
+    /** Copy assignment is disabled because the kernel stores raw tensor pointers */
+    INEWarpKernel &operator=(const INEWarpKernel &) = delete;
+    /** Move construction is allowed (compiler-generated) */
+    INEWarpKernel(INEWarpKernel &&) = default;
+    /** Move assignment is allowed (compiler-generated) */
+    INEWarpKernel &operator=(INEWarpKernel &&) = default;
+    /** Default destructor */
+    ~INEWarpKernel() = default;
+    /** Initialise the kernel's input, output, transformation matrix and border handling.
+     *
+     * @param[in]  input                 Source tensor. Data type supported: U8.
+     * @param[out] output                Destination tensor. Data type supported: U8.
+     * @param[in]  matrix                The perspective or affine matrix to use, of type float: 2x3 for affine, 3x3 for perspective.
+     *                                   The argument always holds 9 values; for the affine case the last 3 values are ignored.
+     * @param[in]  border_mode           Strategy to use for borders
+     * @param[in]  constant_border_value Constant value used for filling the border.
+     */
+    virtual void configure(const ITensor *input, ITensor *output, const std::array<float, 9> &matrix, BorderMode border_mode, uint8_t constant_border_value);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+
+protected:
+    /** Function to perform warp affine or warp perspective on the given window when border mode == UNDEFINED
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    virtual void warp_undefined(const Window &window) = 0;
+    /** Function to perform warp affine or warp perspective on the given window when border mode == CONSTANT
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    virtual void warp_constant(const Window &window) = 0;
+    /** Function to perform warp affine or warp perspective on the given window when border mode == REPLICATE
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    virtual void warp_replicate(const Window &window) = 0;
+    /** Pointer to the specialised warp member function to execute; all of them share this signature
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    void (INEWarpKernel::*_func)(const Window &window);
+
+    const ITensor *_input;                 /**< Input Tensor */
+    ITensor       *_output;                /**< Output Tensor */
+    uint8_t        _constant_border_value; /**< Constant value used for filling the border. This value is used for those pixels out of the ROI when the border mode is CONSTANT */
+    std::array<float, 9> _matrix;          /**< The affine or perspective matrix. Must be 2x3 for warp affine or 3x3 for warp perspective of type float. */
+};
+
+/** Template interface for the kernel to compute warp affine
+ * @tparam interpolation Interpolation policy this kernel is instantiated for
+ */
+template <InterpolationPolicy interpolation>
+class NEWarpAffineKernel : public INEWarpKernel
+{
+private:
+    const char *name() const override
+    {
+        return "NEWarpAffineKernel";
+    }
+    // Inherited methods overridden (one implementation per border mode, see INEWarpKernel):
+    void warp_undefined(const Window &window) override;
+    void warp_constant(const Window &window) override;
+    void warp_replicate(const Window &window) override;
+};
+
+/** Template interface for the kernel to compute warp perspective
+ * @tparam interpolation Interpolation policy this kernel is instantiated for
+ */
+template <InterpolationPolicy interpolation>
+class NEWarpPerspectiveKernel : public INEWarpKernel
+{
+private:
+    const char *name() const override
+    {
+        return "NEWarpPerspectiveKernel";
+    }
+    // Inherited methods overridden (one implementation per border mode, see INEWarpKernel):
+    void warp_undefined(const Window &window) override;
+    void warp_constant(const Window &window) override;
+    void warp_replicate(const Window &window) override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEWARPKERNEL_H */
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index c7fa2d2..118655b 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.h b/src/core/NEON/kernels/NEWeightsReshapeKernel.h
new file mode 100644
index 0000000..9678b79
--- /dev/null
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
+#define ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform reshaping on the weights used by convolution and locally connected layer
+ *
+ * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
+ * In combination with the @ref NEIm2ColKernel can transform a convolution to a matrix multiplication.
+ *
+ * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NEWeightsReshapeKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEWeightsReshapeKernel";
+    }
+    /** Constructor. */
+    NEWeightsReshapeKernel();
+    /** Copy construction is disabled because the kernel stores raw tensor pointers */
+    NEWeightsReshapeKernel(const NEWeightsReshapeKernel &) = delete;
+    /** Copy assignment is disabled because the kernel stores raw tensor pointers */
+    NEWeightsReshapeKernel &operator=(const NEWeightsReshapeKernel &) = delete;
+    /** Move construction is allowed (compiler-generated) */
+    NEWeightsReshapeKernel(NEWeightsReshapeKernel &&) = default;
+    /** Move assignment is allowed (compiler-generated) */
+    NEWeightsReshapeKernel &operator=(NEWeightsReshapeKernel &&) = default;
+    /** Default destructor */
+    ~NEWeightsReshapeKernel() = default;
+    /** Set the input, the bias to append and the output of the kernel.
+     *
+     * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
+     *                    Data types supported: All
+     * @param[in]  bias   The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
+     *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+     *                    @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
+     * @param[out] output The output tensor. Data types supported: Same as @p input
+     */
+    void configure(const ITensor *input, const ITensor *bias, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel
+     *
+     * @param[in] input  The input tensor info to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                   and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
+     *                   Data types supported: All
+     * @param[in] biases The shared biases tensor info to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
+     *                   dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+     *                   @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
+     * @param[in] output The output tensor info. Should be a 2D Tensor. Data types supported: Same as @p input
+     *
+     * @return A status containing an error code in case of failure
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;  /**< Input (weights) tensor, only accessed through a pointer-to-const */
+    const ITensor *_bias;   /**< Bias tensor, only accessed through a pointer-to-const */
+    ITensor       *_output; /**< Output tensor */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
index 90afbd6..b5afeed 100644
--- a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
new file mode 100644
index 0000000..81b4cbe
--- /dev/null
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H
+#define ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the width concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor at a given offset on the X axis.
+ */
+class NEWidthConcatenateLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEWidthConcatenateLayerKernel";
+    }
+    /** Default constructor */
+    NEWidthConcatenateLayerKernel();
+    /** Prevent instances of this class from being copied */
+    NEWidthConcatenateLayerKernel(const NEWidthConcatenateLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied */
+    NEWidthConcatenateLayerKernel &operator=(const NEWidthConcatenateLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEWidthConcatenateLayerKernel(NEWidthConcatenateLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEWidthConcatenateLayerKernel &operator=(NEWidthConcatenateLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEWidthConcatenateLayerKernel() = default;
+    /** Initialise the kernel from the tensor infos of its input and output (the tensors themselves are supplied to run_op())
+     *
+     * @param[in]     input        Input tensor info. Data types supported: All
+     * @param[in]     width_offset The offset on the X axis.
+     * @param[in,out] output       Output tensor info. Data types supported: Same as @p input.
+     */
+    void configure(const ITensorInfo *input, unsigned int width_offset, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEWidthConcatenateLayerKernel
+     *
+     * @param[in] input        Input tensor info. Data types supported: All
+     * @param[in] width_offset The offset on the X axis.
+     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return A status containing an error code in case of failure
+     */
+    static Status validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+private:
+    unsigned int _width_offset; /**< Offset on the X axis; presumably the value given to configure() */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
index bf5d77f..2b87e51 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H
 #define ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H
 
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/core/NEON/kernels/convolution/common/convolution.hpp"
 #include "src/core/NEON/kernels/convolution/common/tensor.hpp"
 
diff --git a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
index 48c0616..33bcc20 100644
--- a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
+#include "src/core/NEON/kernels/NEYOLOLayerKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
diff --git a/src/core/NEON/kernels/NEYOLOLayerKernel.h b/src/core/NEON/kernels/NEYOLOLayerKernel.h
new file mode 100644
index 0000000..806cf9c
--- /dev/null
+++ b/src/core/NEON/kernels/NEYOLOLayerKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEYOLOLAYERKERNEL_H
+#define ARM_COMPUTE_NEYOLOLAYERKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel interface for the YOLO layer (derives from @ref INEKernel). */
+class NEYOLOLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEYOLOLayerKernel";
+    }
+    /** Default constructor */
+    NEYOLOLayerKernel();
+    /** Prevent instances of this class from being copy-constructed (as this class holds raw tensor pointers) */
+    NEYOLOLayerKernel(const NEYOLOLayerKernel &) = delete;
+    /** Allow instances of this class to be moved (defaulted move constructor) */
+    NEYOLOLayerKernel(NEYOLOLayerKernel &&) = default;
+    /** Prevent instances of this class from being copy-assigned (as this class holds raw tensor pointers) */
+    NEYOLOLayerKernel &operator=(const NEYOLOLayerKernel &) = delete;
+    /** Allow instances of this class to be move-assigned (defaulted move assignment operator) */
+    NEYOLOLayerKernel &operator=(NEYOLOLayerKernel &&) = default;
+    /** Default destructor (defaulted, so the tensor pointers held by the kernel are not released here) */
+    ~NEYOLOLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place.
+     *
+     * @param[in, out] input       Source tensor. If @p output is nullptr, this tensor will store the result
+     *                             of the activation function. Data types supported: F16/F32.
+     * @param[out]     output      Destination tensor. Data types supported: same as @p input.
+     * @param[in]      act_info    Activation layer parameters.
+     * @param[in]      num_classes Number of classes to activate (must be a submultiple of the @p input channels).
+     */
+    void configure(ITensor *input, ITensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
+    /** Static function to check if the given info will lead to a valid configuration of @ref NEYOLOLayerKernel
+     *
+     * @param[in] input       Source tensor info. If @p output is nullptr, this tensor will store the result
+     *                        of the activation function. Data types supported: F16/F32.
+     * @param[in] output      Destination tensor info. Data types supported: same as @p input.
+     * @param[in] act_info    Activation layer information.
+     * @param[in] num_classes Number of classes to activate (must be a submultiple of the @p input channels).
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Function to run YOLO layer (NOTE(review): presumably the NCHW-format counterpart of yolo_layer_nhwc - confirm)
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename T, int S>
+    void yolo_layer_nchw(const Window &window);
+    /** Function to run YOLO layer on tensors with NHWC format
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename T>
+    void yolo_layer_nhwc(const Window &window);
+    /** Common member-function pointer signature shared by the yolo layer functions declared above
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using YOLOFunctionPtr = void (NEYOLOLayerKernel::*)(const Window &window);
+
+private:
+    YOLOFunctionPtr     _func;
+    ITensor            *_input;
+    ITensor            *_output;
+    ActivationLayerInfo _act_info;
+    int32_t             _num_classes;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEYOLOLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
index 030f1aa..92c0132 100644
--- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
+++ b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
@@ -24,7 +24,7 @@
 #ifndef SRC_INEGEMMWRAPPERKERNEL_H
 #define SRC_INEGEMMWRAPPERKERNEL_H
 
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
index a2f7e3b..a956898 100644
--- a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
+++ b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
@@ -24,9 +24,9 @@
 #ifndef SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
 #define SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
 
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/INEKernel.h"
 
 #include "src/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
 
diff --git a/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h b/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
index 4af82f8..7fcf2b1 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
+++ b/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
 #define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
 
-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_gemm_compute_iface.hpp"
+#include "src/core/NEON/INEKernel.h"
 
 #include "gemm_common.hpp"
 
diff --git a/src/core/TracePoint.cpp b/src/core/TracePoint.cpp
index 06d9527..d67faad 100644
--- a/src/core/TracePoint.cpp
+++ b/src/core/TracePoint.cpp
@@ -33,12 +33,12 @@
 #include "arm_compute/core/IPyramid.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
 #include "utils/TypePrinter.h"
 
 #include <array>
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index 95c6631..ec06f3f 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/graph/nodes/Nodes.h"
 #include "arm_compute/runtime/CPP/CPPFunctions.h"
 #include "arm_compute/runtime/NEON/NEFunctions.h"
+#include "src/core/NEON/NEKernels.h"
 #include "support/Cast.h"
 #include "support/ToolchainSupport.h"
 
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index 63e8ff9..a9e5a86 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -28,6 +28,19 @@
 
 #include "arm_compute/runtime/CPP/CPPFunctions.h"
 #include "arm_compute/runtime/NEON/NEFunctions.h"
+#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
+#include "src/core/NEON/kernels/NEReshapeLayerKernel.h"
+#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
 #include "support/Cast.h"
 
 using namespace arm_compute::utils::cast;
diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp
index 75068b1..a13b29b 100644
--- a/src/runtime/NEON/INEOperator.cpp
+++ b/src/runtime/NEON/INEOperator.cpp
@@ -22,12 +22,16 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/INEOperator.h"
+#include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
 {
 namespace experimental
 {
+INEOperator::~INEOperator() = default;
+
 INEOperator::INEOperator(IRuntimeContext *ctx)
     : _kernel(), _ctx(ctx), _workspace()
 {
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index cef2762..5438bce 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,9 +23,14 @@
  */
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
 
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+INESimpleFunction::~INESimpleFunction() = default;
 
 INESimpleFunction::INESimpleFunction() // NOLINT
     : _kernel(),
@@ -35,6 +40,7 @@
 
 void INESimpleFunction::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
     NEScheduler::get().schedule(_kernel.get(), Window::DimY);
 }
+} //namespace arm_compute
diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
index f2181e0..21dd58e 100644
--- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
+++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
@@ -23,11 +23,15 @@
  */
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 
+#include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/INEKernel.h"
 #include "src/runtime/Utils.h"
 
 namespace arm_compute
 {
+INESimpleFunctionNoBorder::~INESimpleFunctionNoBorder() = default;
+
 INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx)
     : _kernel(),
       _ctx(ctx)
diff --git a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
index ec27820..df2bc7d 100644
--- a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
+++ b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
 
-#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+#include "src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEAbsoluteDifference::~NEAbsoluteDifference() = default;
 
 void NEAbsoluteDifference::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
 {
@@ -36,3 +38,4 @@
     k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEAccumulate.cpp b/src/runtime/NEON/functions/NEAccumulate.cpp
index 662f8cc..20eefd9 100644
--- a/src/runtime/NEON/functions/NEAccumulate.cpp
+++ b/src/runtime/NEON/functions/NEAccumulate.cpp
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
 
-#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
+#include "src/core/NEON/kernels/NEAccumulateKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEAccumulate::~NEAccumulate() = default;
 
 void NEAccumulate::configure(const ITensor *input, ITensor *output)
 {
@@ -37,6 +39,8 @@
     _kernel = std::move(k);
 }
 
+NEAccumulateWeighted::~NEAccumulateWeighted() = default;
+
 void NEAccumulateWeighted::configure(const ITensor *input, float alpha, ITensor *output, bool use_fp16)
 {
     if(use_fp16)
@@ -53,9 +57,12 @@
     }
 }
 
+NEAccumulateSquared::~NEAccumulateSquared() = default;
+
 void NEAccumulateSquared::configure(const ITensor *input, uint32_t shift, ITensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEAccumulateSquaredKernel>();
     k->configure(input, shift, output);
     _kernel = std::move(k);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index 7f55edb..f9ad298 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -24,16 +24,18 @@
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/IRuntimeContext.h"
 #include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NEActivationLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
 namespace experimental
 {
+NEActivationLayer::~NEActivationLayer() = default;
+
 void NEActivationLayer::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernel>();
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index 70bbba6..2a9bb76 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -29,11 +29,14 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
 
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEArgMinMaxLayer::~NEArgMinMaxLayer() = default;
+
 NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _reduction_function(support::cpp14::make_unique<NEReductionOperation>())
 {
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 4453a01..0bf9a09 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+#include "src/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
@@ -33,6 +33,8 @@
 {
 namespace experimental
 {
+NEArithmeticAddition::~NEArithmeticAddition() = default;
+
 void NEArithmeticAddition::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 1c95bbf..ba3f426 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+#include "src/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index eab40ac..d0fdfcf 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -29,10 +29,13 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
 
 #include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default;
 
 NEBatchNormalizationLayer::NEBatchNormalizationLayer()
     : _norm_kernel()
@@ -43,7 +46,8 @@
                                           ActivationLayerInfo act_info)
 {
     // Configure kernel
-    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info);
+    _norm_kernel = arm_compute::support::cpp14::make_unique<NEBatchNormalizationLayerKernel>();
+    _norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
 Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma,
@@ -55,5 +59,6 @@
 
 void NEBatchNormalizationLayer::run()
 {
-    NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+    NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
index 2705cff..77a63c0 100644
--- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
 
 #include "support/MemorySupport.h"
 
diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
index 1d89308..f3b5220 100644
--- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
 
-#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseAndKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp
index 585b059..036584e 100644
--- a/src/runtime/NEON/functions/NEBitwiseNot.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
 
-#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseNotKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp
index bba866d..fc905a0 100644
--- a/src/runtime/NEON/functions/NEBitwiseOr.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
 
-#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseOrKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
index 188fe3d..301a0c4 100644
--- a/src/runtime/NEON/functions/NEBitwiseXor.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
 
-#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
+#include "src/core/NEON/kernels/NEBitwiseXorKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
index b1ecfaf..0b63943 100644
--- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
+++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
+#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
 
 #include "support/MemorySupport.h"
 
diff --git a/src/runtime/NEON/functions/NEBox3x3.cpp b/src/runtime/NEON/functions/NEBox3x3.cpp
index a380377..01d2356 100644
--- a/src/runtime/NEON/functions/NEBox3x3.cpp
+++ b/src/runtime/NEON/functions/NEBox3x3.cpp
@@ -23,14 +23,15 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
 
-#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEBox3x3Kernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEBox3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
 {
     if(use_fp16)
@@ -45,5 +46,8 @@
         k->configure(input, output, border_mode == BorderMode::UNDEFINED);
         _kernel = std::move(k);
     }
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index d7ec52c..bf4f7d7 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -25,8 +25,6 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
@@ -34,13 +32,19 @@
 #include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
 #include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NECannyEdgeKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NESobel5x5Kernel.h"
+#include "src/core/NEON/kernels/NESobel7x7Kernel.h"
 #include "support/MemorySupport.h"
 
 #include <cstring>
 #include <inttypes.h>
 #include <utility>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NECannyEdge::~NECannyEdge() = default;
 
 NECannyEdge::NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
@@ -139,21 +143,25 @@
     _memory_group.manage(&_nonmax);
 
     // Configure non-maxima suppression
-    _non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED);
+    _non_max_suppr = arm_compute::support::cpp14::make_unique<NEEdgeNonMaxSuppressionKernel>();
+    _non_max_suppr->configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED);
 
     // Fill border around magnitude image as non-maxima suppression will access
     // it. If border mode is undefined filling the border is a nop.
-    _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value);
+    _border_mag_gradient = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    _border_mag_gradient->configure(&_magnitude, _non_max_suppr->border_size(), border_mode, constant_border_value);
 
     // Allocate intermediate tensors
     _phase.allocator()->allocate();
     _magnitude.allocator()->allocate();
 
     // Configure edge tracing
-    _edge_trace.configure(&_nonmax, output);
+    _edge_trace = arm_compute::support::cpp14::make_unique<NEEdgeTraceKernel>();
+    _edge_trace->configure(&_nonmax, output);
 
     // Fill border with "No edge" to stop recursion in edge trace
-    _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
+    _border_edge_trace = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    _border_edge_trace->configure(&_nonmax, _edge_trace->border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
 
     // Allocate intermediate tensors
     _nonmax.allocator()->allocate();
@@ -172,17 +180,18 @@
     NEScheduler::get().schedule(_gradient.get(), Window::DimY);
 
     // Fill border before non-maxima suppression. Nop for border mode undefined.
-    NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ);
+    NEScheduler::get().schedule(_border_mag_gradient.get(), Window::DimZ);
 
     // Run non-maxima suppression
-    NEScheduler::get().schedule(&_non_max_suppr, Window::DimY);
+    NEScheduler::get().schedule(_non_max_suppr.get(), Window::DimY);
 
     ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
     std::fill_n(_output->buffer(), _output->info()->total_size(), 0);
 
     // Fill border before edge trace
-    NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ);
+    NEScheduler::get().schedule(_border_edge_trace.get(), Window::DimZ);
 
     // Run edge tracing
-    NEScheduler::get().schedule(&_edge_trace, Window::DimY);
+    NEScheduler::get().schedule(_edge_trace.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp
index 4b35110..7fd2605 100644
--- a/src/runtime/NEON/functions/NECast.cpp
+++ b/src/runtime/NEON/functions/NECast.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/NEON/functions/NECast.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEChannelCombine.cpp b/src/runtime/NEON/functions/NEChannelCombine.cpp
index e987951..f8a9be0 100644
--- a/src/runtime/NEON/functions/NEChannelCombine.cpp
+++ b/src/runtime/NEON/functions/NEChannelCombine.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
 
-#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
+#include "src/core/NEON/kernels/NEChannelCombineKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEChannelExtract.cpp b/src/runtime/NEON/functions/NEChannelExtract.cpp
index d78a8f8..8f5e4d4 100644
--- a/src/runtime/NEON/functions/NEChannelExtract.cpp
+++ b/src/runtime/NEON/functions/NEChannelExtract.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
 
-#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
+#include "src/core/NEON/kernels/NEChannelExtractKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
index 0392a92..c72dec6 100644
--- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
+++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECol2Im.cpp b/src/runtime/NEON/functions/NECol2Im.cpp
index e4fe36f..0706125 100644
--- a/src/runtime/NEON/functions/NECol2Im.cpp
+++ b/src/runtime/NEON/functions/NECol2Im.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NECol2Im.h"
 
-#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+#include "src/core/NEON/kernels/NECol2ImKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEColorConvert.cpp b/src/runtime/NEON/functions/NEColorConvert.cpp
index 7befac7..ebdd104 100644
--- a/src/runtime/NEON/functions/NEColorConvert.cpp
+++ b/src/runtime/NEON/functions/NEColorConvert.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
 
-#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
+#include "src/core/NEON/kernels/NEColorConvertKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp b/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
index cb89117..3f5712d 100644
--- a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
+++ b/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h"
 
+#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index 72bd9e6..03a01ae 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -23,10 +23,10 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
 
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
index f697efb..291afe0 100644
--- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,13 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default;
+
 NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
     : _kernel()
 {
@@ -33,7 +37,8 @@
 void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
                                                DataLayout data_layout)
 {
-    _kernel.configure(input, output, original_input_shape, data_layout);
+    _kernel = arm_compute::support::cpp14::make_unique<NEConvertFullyConnectedWeightsKernel>();
+    _kernel->configure(input, output, original_input_shape, data_layout);
 }
 
 Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
@@ -44,6 +49,6 @@
 
 void NEConvertFullyConnectedWeights::run()
 {
-    NEScheduler::get().schedule(&_kernel, Window::DimZ);
+    NEScheduler::get().schedule(_kernel.get(), Window::DimZ);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index 8200a08..07ac8bd 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -25,29 +25,38 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEConvolutionKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "support/MemorySupport.h"
 
 #include <array>
 #include <utility>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEConvolution3x3::~NEConvolution3x3() = default;
 
 void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEConvolution3x3Kernel>();
     k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
 
 template <unsigned int matrix_size>
+NEConvolutionSquare<matrix_size>::~NEConvolutionSquare() = default;
+
+template <unsigned int matrix_size>
 NEConvolutionSquare<matrix_size>::NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
 {
@@ -66,6 +75,7 @@
 
     _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size);
 
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
     if(_is_separable)
     {
         DataType intermediate_type = DataType::UNKNOWN;
@@ -82,35 +92,40 @@
             scale = calculate_matrix_scale(conv, matrix_size);
         }
 
-        _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
-        _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED);
+        _kernel_hor  = arm_compute::support::cpp14::make_unique<NESeparableConvolutionHorKernel<matrix_size>>();
+        _kernel_vert = arm_compute::support::cpp14::make_unique<NESeparableConvolutionVertKernel<matrix_size>>();
+
+        _kernel_hor->configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
+        _kernel_vert->configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED);
 
         _tmp.allocator()->allocate();
 
-        _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+        b->configure(input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value));
     }
     else
     {
-        _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
-        _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value));
+        _kernel = arm_compute::support::cpp14::make_unique<NEConvolutionKernel<matrix_size>>();
+        _kernel->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+        b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
     }
+    _border_handler = std::move(b);
 }
 
 template <unsigned int matrix_size>
 void                   NEConvolutionSquare<matrix_size>::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
 
     if(_is_separable)
     {
         MemoryGroupResourceScope scope_mg(_memory_group);
 
-        NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
-        NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+        NEScheduler::get().schedule(_kernel_hor.get(), Window::DimY);
+        NEScheduler::get().schedule(_kernel_vert.get(), Window::DimY);
     }
     else
     {
-        NEScheduler::get().schedule(&_kernel, Window::DimY);
+        NEScheduler::get().schedule(_kernel.get(), Window::DimY);
     }
 }
 
@@ -118,10 +133,16 @@
 template class arm_compute::NEConvolutionSquare<7>;
 template class arm_compute::NEConvolutionSquare<9>;
 
+NEConvolutionRectangle::~NEConvolutionRectangle() = default;
+
 void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEConvolutionRectangleKernel>();
     k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 491425c..901b1e8 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -27,6 +27,27 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NECol2ImKernel.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/NEON/kernels/NEIm2ColKernel.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
 #include "support/MemorySupport.h"
 
 #include <cmath>
diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp
index a461c18..9e7bf40 100644
--- a/src/runtime/NEON/functions/NECopy.cpp
+++ b/src/runtime/NEON/functions/NECopy.cpp
@@ -23,13 +23,15 @@
  */
 #include "arm_compute/runtime/NEON/functions/NECopy.h"
 
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
 namespace arm_compute
 {
+NECopy::~NECopy() = default;
+
 void NECopy::configure(ITensor *input, ITensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NECopyKernel>();
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
index f8f9916..2e2d225 100644
--- a/src/runtime/NEON/functions/NECropResize.cpp
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "arm_compute/runtime/NEON/functions/NECropResize.h"
+#include "src/core/NEON/kernels/NECropKernel.h"
 
 #include "support/MemorySupport.h"
 
@@ -31,6 +32,8 @@
 
 namespace arm_compute
 {
+NECropResize::~NECropResize() = default;
+
 NECropResize::NECropResize()
     : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results()
 {
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index cb9ab16..2b5b008 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
 using namespace arm_compute::misc::shape_calculator;
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 1ffcca0..af0f5ef 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
+#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
index 0aaa37e..c4f15e3 100644
--- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
 
 #include "support/MemorySupport.h"
 
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 6c22523..fc97279 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -27,6 +27,8 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
+#include "support/MemorySupport.h"
 
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
@@ -69,10 +71,11 @@
 }
 } // namespace
 
+NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default;
+
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(),
-      _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_nchw(true), _permute(false),
-      _is_activationlayer_enabled(false), _is_prepared(false)
+    : _memory_group(memory_manager), _dwc_optimized_func(memory_manager), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _accumulator(), _permuted_input(),
+      _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
@@ -243,7 +246,8 @@
     }
     _original_weights = weights_to_use;
 
-    _depthwise_conv_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation);
+    _depthwise_conv_kernel = arm_compute::support::cpp14::make_unique<NEDepthwiseConvolutionLayerNativeKernel>();
+    _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation);
 
     if(_is_nchw)
     {
@@ -309,7 +313,7 @@
         _permute_input.run();
     }
 
-    NEScheduler::get().schedule(&_depthwise_conv_kernel, Window::DimY);
+    NEScheduler::get().schedule(_depthwise_conv_kernel.get(), Window::DimY);
 
     if(_is_nchw)
     {
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index a4a3a43..0c0f86c 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -24,7 +24,7 @@
 
 #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
+#include "src/core/NEON/kernels/NEDequantizationLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
index 2499140..f007e9f 100644
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,12 +24,16 @@
 #include "arm_compute/runtime/NEON/functions/NEDerivative.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEDerivativeKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEDerivative::~NEDerivative() = default;
 
 NEDerivative::NEDerivative()
     : _kernel(), _border_handler()
@@ -41,12 +45,16 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
 
-    _kernel.configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
+    _kernel         = arm_compute::support::cpp14::make_unique<NEDerivativeKernel>();
+    _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+
+    _kernel->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+    _border_handler->configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
 
 void NEDerivative::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
-    NEScheduler::get().schedule(&_kernel, Window::DimY);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
+    NEScheduler::get().schedule(_kernel.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDilate.cpp b/src/runtime/NEON/functions/NEDilate.cpp
index 7f50386..70c0b61 100644
--- a/src/runtime/NEON/functions/NEDilate.cpp
+++ b/src/runtime/NEON/functions/NEDilate.cpp
@@ -23,8 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEDilate.h"
 
-#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEDilateKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
@@ -36,5 +37,8 @@
     auto k = arm_compute::support::cpp14::make_unique<NEDilateKernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index fe54590..98d6386 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -27,9 +27,15 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default;
+
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
       _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
@@ -39,6 +45,9 @@
 void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
+    _output_stage_kernel  = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayerOutputStageKernel>();
+    _conv_kernel          = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayerKernel>();
+    _input_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
 
     // Free accumulator
     if(_accumulator.buffer() != nullptr)
@@ -51,17 +60,17 @@
     // Check if bias should be added in the convolution result
     _has_bias = (bias != nullptr);
 
-    _conv_kernel.configure(input, weights, output, conv_info);
+    _conv_kernel->configure(input, weights, output, conv_info);
     if(_has_bias)
     {
-        _output_stage_kernel.configure(output, bias);
+        _output_stage_kernel->configure(output, bias);
     }
-    _is_padding_required = !_conv_kernel.border_size().empty();
+    _is_padding_required = !_conv_kernel->border_size().empty();
 
     if(_is_padding_required)
     {
         // Add zero padding XY
-        _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+        _input_border_handler->configure(input, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
     }
 
     //Configure Activation Layer
@@ -109,12 +118,12 @@
 
     if(_is_padding_required)
     {
-        NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
+        NEScheduler::get().schedule(_input_border_handler.get(), Window::DimZ);
     }
-    NEScheduler::get().schedule(&_conv_kernel, _dim_split);
+    NEScheduler::get().schedule(_conv_kernel.get(), _dim_split);
     if(_has_bias)
     {
-        NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
+        NEScheduler::get().schedule(_output_stage_kernel.get(), Window::DimY);
     }
 
     if(_is_activationlayer_enabled)
diff --git a/src/runtime/NEON/functions/NEElementwiseOperators.cpp b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
index d1f60c7..7f3fe8b 100644
--- a/src/runtime/NEON/functions/NEElementwiseOperators.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
-#include <arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h>
+#include "src/core/NEON/kernels/NEElementwiseOperationKernel.h"
 
 #include "arm_compute/core/ITensor.h"
 #include "support/MemorySupport.h"
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
index cb4e3a0..5e13020 100644
--- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
+#include "src/core/NEON/kernels/NEElementwiseUnaryKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
index b3d5ad4..d3ff171 100644
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,14 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NECumulativeDistributionKernel.h"
+#include "src/core/NEON/kernels/NEHistogramKernel.h"
+#include "src/core/NEON/kernels/NETableLookupKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEEqualizeHistogram::~NEEqualizeHistogram() = default;
 
 NEEqualizeHistogram::NEEqualizeHistogram()
     : _histogram_kernel(), _cd_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
@@ -43,20 +49,25 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
 
+    _histogram_kernel     = arm_compute::support::cpp14::make_unique<NEHistogramKernel>();
+    _cd_histogram_kernel  = arm_compute::support::cpp14::make_unique<NECumulativeDistributionKernel>();
+    _map_histogram_kernel = arm_compute::support::cpp14::make_unique<NETableLookupKernel>();
+
     // Configure kernels
-    _histogram_kernel.configure(input, &_hist);
-    _cd_histogram_kernel.configure(input, &_hist, &_cum_dist, &_cd_lut);
-    _map_histogram_kernel.configure(input, &_cd_lut, output);
+    _histogram_kernel->configure(input, &_hist);
+    _cd_histogram_kernel->configure(input, &_hist, &_cum_dist, &_cd_lut);
+    _map_histogram_kernel->configure(input, &_cd_lut, output);
 }
 
 void NEEqualizeHistogram::run()
 {
     // Calculate histogram of input.
-    NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
+    NEScheduler::get().schedule(_histogram_kernel.get(), Window::DimY);
 
     // Calculate cumulative distribution of histogram and create LUT.
-    NEScheduler::get().schedule(&_cd_histogram_kernel, Window::DimY);
+    NEScheduler::get().schedule(_cd_histogram_kernel.get(), Window::DimY);
 
     // Map input to output using created LUT.
-    NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
+    NEScheduler::get().schedule(_map_histogram_kernel.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEErode.cpp b/src/runtime/NEON/functions/NEErode.cpp
index a89993c..748694f 100644
--- a/src/runtime/NEON/functions/NEErode.cpp
+++ b/src/runtime/NEON/functions/NEErode.cpp
@@ -23,18 +23,23 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEErode.h"
 
-#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEErodeKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEErode::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEErodeKernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
index 2c53b18..b94c258 100644
--- a/src/runtime/NEON/functions/NEFFT1D.cpp
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -26,10 +26,16 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
 #include "src/core/utils/helpers/fft.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEFFT1D::~NEFFT1D() = default;
+
 NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
 {
@@ -58,7 +64,8 @@
     TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
     _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
     _memory_group.manage(&_digit_reversed_input);
-    _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+    _digit_reverse_kernel = arm_compute::support::cpp14::make_unique<NEFFTDigitReverseKernel>();
+    _digit_reverse_kernel->configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
 
     // Create and configure FFT kernels
     unsigned int Nx = 1;
@@ -75,7 +82,8 @@
         fft_kernel_info.radix          = radix_for_stage;
         fft_kernel_info.Nx             = Nx;
         fft_kernel_info.is_first_stage = (i == 0);
-        _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+        _fft_kernels[i]                = arm_compute::support::cpp14::make_unique<NEFFTRadixStageKernel>();
+        _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
 
         Nx *= radix_for_stage;
     }
@@ -86,7 +94,8 @@
         FFTScaleKernelInfo scale_config;
         scale_config.scale     = static_cast<float>(N);
         scale_config.conjugate = config.direction == FFTDirection::Inverse;
-        is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+        _scale_kernel          = arm_compute::support::cpp14::make_unique<NEFFTScaleKernel>();
+        is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config);
     }
 
     // Allocate tensors
@@ -128,17 +137,17 @@
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    NEScheduler::get().schedule(&_digit_reverse_kernel, (_axis == 0 ? Window::DimY : Window::DimZ));
+    NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ));
 
     for(unsigned int i = 0; i < _num_ffts; ++i)
     {
-        NEScheduler::get().schedule(&_fft_kernels[i], (_axis == 0 ? Window::DimY : Window::DimX));
+        NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX));
     }
 
     // Run output scaling
     if(_run_scale)
     {
-        NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
+        NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY);
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp
index b63afe5..3b787cd 100644
--- a/src/runtime/NEON/functions/NEFFT2D.cpp
+++ b/src/runtime/NEON/functions/NEFFT2D.cpp
@@ -26,9 +26,14 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Scheduler.h"
+#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
 
 namespace arm_compute
 {
+NEFFT2D::~NEFFT2D() = default;
+
 NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
 {
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
index a46fc9f..23788b7 100644
--- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -27,6 +27,12 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/utils/helpers/fft.h"
 
@@ -96,6 +102,7 @@
       _is_prepared(false)
 {
 }
+NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default;
 
 void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                       const ActivationLayerInfo &act_info)
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 303c593..1bde3cc 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,15 +25,21 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Array.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFastCornersKernel.h"
+#include "src/core/NEON/kernels/NEFillArrayKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEFastCorners::~NEFastCorners() = default;
 
 NEFastCorners::NEFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
@@ -62,24 +68,28 @@
     _output.allocator()->init(tensor_info);
     _memory_group.manage(&_output);
 
+    _fast_corners_kernel = arm_compute::support::cpp14::make_unique<NEFastCornersKernel>();
+    _border_handler      = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    _fill_kernel         = arm_compute::support::cpp14::make_unique<NEFillArrayKernel>();
     // If border is UNDEFINED _fast_corners_kernel will operate in xwindow (3,
     // width - 3) and ywindow (3, height -3) so the output image will leave the
     // pixels on the borders unchanged. This is reflected in the valid region
     // of the output. The non maxima suppression is only run on the valid
     // pixels.
-    _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, BorderMode::UNDEFINED == border_mode);
-    _border_handler.configure(input, _fast_corners_kernel.border_size(), border_mode, constant_border_value);
+    _fast_corners_kernel->configure(input, &_output, threshold, nonmax_suppression, BorderMode::UNDEFINED == border_mode);
+    _border_handler->configure(input, _fast_corners_kernel->border_size(), border_mode, constant_border_value);
 
     if(!_non_max)
     {
-        _fill_kernel.configure(&_output, 1 /* we keep all texels >0 */, corners);
+        _fill_kernel->configure(&_output, 1 /* we keep all texels >0 */, corners);
     }
     else
     {
         _suppressed.allocator()->init(tensor_info);
         _memory_group.manage(&_suppressed);
-        _nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode);
-        _fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners);
+        _nonmax_kernel = arm_compute::support::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
+        _nonmax_kernel->configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode);
+        _fill_kernel->configure(&_suppressed, 1 /* we keep all texels >0 */, corners);
 
         // Allocate intermediate tensors
         _suppressed.allocator()->allocate();
@@ -91,16 +101,17 @@
 
 void NEFastCorners::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
+    NEScheduler::get().schedule(_fast_corners_kernel.get(), Window::DimY);
 
     if(_non_max)
     {
-        NEScheduler::get().schedule(&_nonmax_kernel, Window::DimY);
+        NEScheduler::get().schedule(_nonmax_kernel.get(), Window::DimY);
     }
 
-    NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
+    NEScheduler::get().schedule(_fill_kernel.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp
index 79fe175..68292c9 100644
--- a/src/runtime/NEON/functions/NEFill.cpp
+++ b/src/runtime/NEON/functions/NEFill.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEMemsetKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index de2ef26..e96069f 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,16 +25,19 @@
 
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
 void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
 {
-    _border_handler.configure(input, BorderSize(border_width), border_mode, constant_border_value);
+    _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    _border_handler->configure(input, BorderSize(border_width), border_mode, constant_border_value);
 }
 
 void NEFillBorder::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 936a70d..4dfe963 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
 #include "arm_compute/core/Size2D.h"
+#include "src/core/NEON/kernels/NEFlattenLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
index 95b2497..5f6bd61 100644
--- a/src/runtime/NEON/functions/NEFloor.cpp
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEFloor.h"
 
-#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
+#include "src/core/NEON/kernels/NEFloorKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index d956d16..714fa58 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -29,6 +29,19 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEFlattenLayerKernel.h"
+#include "src/core/NEON/kernels/NEFlattenLayerKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
 
 #include "support/MemorySupport.h"
 
@@ -145,6 +158,8 @@
     return NETransposeKernel::validate(input, output);
 }
 
+NEFullyConnectedLayer::~NEFullyConnectedLayer() = default;
+
 NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(std::move(memory_manager)), _weights_manager(weights_manager), _flatten_kernel(), _convert_weights(), _convert_weights_managed(), _reshape_weights_function(),
       _reshape_weights_managed_function(), _mm_gemm(nullptr, weights_manager), _mm_gemmlowp(nullptr, weights_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(),
@@ -199,7 +214,9 @@
 
     // Configure flatten kernel
     _memory_group.manage(&_flatten_output);
-    _flatten_kernel.configure(input, &_flatten_output);
+
+    _flatten_kernel = arm_compute::support::cpp14::make_unique<NEFlattenLayerKernel>();
+    _flatten_kernel->configure(input, &_flatten_output);
 
     // Configure matrix multiply kernel
     configure_mm(&_flatten_output, weights, biases, output, act);
@@ -398,7 +415,7 @@
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
     {
-        NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+        NEScheduler::get().schedule(_flatten_kernel.get(), Window::DimY);
     }
 
     // Run matrix multiply
diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
index fd26bb4..c64fde0 100644
--- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
+++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,9 +28,13 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEFuseBatchNormalization::~NEFuseBatchNormalization() = default;
+
 NEFuseBatchNormalization::NEFuseBatchNormalization()
     : _fuse_bn_kernel()
 {
@@ -41,7 +45,8 @@
                                          const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
                                          float epsilon, FuseBatchNormalizationType fbn_type)
 {
-    _fuse_bn_kernel.configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    _fuse_bn_kernel = arm_compute::support::cpp14::make_unique<NEFuseBatchNormalizationKernel>();
+    _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
 Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
@@ -54,6 +59,6 @@
 
 void NEFuseBatchNormalization::run()
 {
-    NEScheduler::get().schedule(&_fuse_bn_kernel, Window::DimY);
+    NEScheduler::get().schedule(_fuse_bn_kernel.get(), Window::DimY);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 4166cff..0215098 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -34,7 +34,12 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
+#include "support/MemorySupport.h"
 
 #include <cmath>
 
@@ -42,6 +47,8 @@
 
 namespace arm_compute
 {
+NEGEMM::~NEGEMM() = default;
+
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(),
       _alpha_scale_func(nullptr), _add_bias(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
@@ -88,11 +95,13 @@
             _memory_group.manage(&_tmp_d);
         }
 
+        _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMMatrixMultiplyKernel>();
+
         // Select between GEMV and GEMM
         if(_run_vector_matrix_multiplication)
         {
             // Configure the matrix multiply kernel
-            _mm_kernel.configure(a, b, gemm_output_to_use, alpha, false);
+            _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
         }
         else
         {
@@ -124,13 +133,15 @@
             int k = a->info()->dimension(0);
 
             // Configure interleave kernel
-            _interleave_kernel.configure(a, &_tmp_a);
+            _interleave_kernel = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+            _interleave_kernel->configure(a, &_tmp_a);
 
             // Configure transpose kernel
-            _transpose_kernel.configure(b, &_tmp_b);
+            _transpose_kernel = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+            _transpose_kernel->configure(b, &_tmp_b);
 
             // Configure matrix multiplication kernel
-            _mm_kernel.configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
+            _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
 
             // Allocate once the all configure methods have been called
             _tmp_a.allocator()->allocate();
@@ -150,7 +161,8 @@
     // Configure matrix addition kernel
     if(_run_addition)
     {
-        _ma_kernel.configure(c, d, beta);
+        _ma_kernel = arm_compute::support::cpp14::make_unique<NEGEMMMatrixAdditionKernel>();
+        _ma_kernel->configure(c, d, beta);
     }
 
     // Configure activation
@@ -298,16 +310,16 @@
         if(!_run_vector_matrix_multiplication)
         {
             // Run interleave kernel
-            NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+            NEScheduler::get().schedule(_interleave_kernel.get(), Window::DimY);
 
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+                NEScheduler::get().schedule(_transpose_kernel.get(), Window::DimY);
             }
         }
 
-        NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
+        NEScheduler::get().schedule(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
 
         // Run bias addition kernel
         if(_run_bias_addition)
@@ -319,7 +331,7 @@
     // Run matrix addition kernel
     if(_run_addition)
     {
-        NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+        NEScheduler::get().schedule(_ma_kernel.get(), Window::DimY);
     }
 
     // Run activation function
@@ -355,7 +367,7 @@
             }
 
             _tmp_b.allocator()->allocate();
-            NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+            NEScheduler::get().schedule(_transpose_kernel.get(), Window::DimY);
             if(!original_b_managed_by_weights_manager)
             {
                 _original_b->mark_as_unused();
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 834a66a..3f50f81 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -30,6 +30,21 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "src/core/NEON/kernels/NECol2ImKernel.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/NEON/kernels/NEIm2ColKernel.h"
+#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "support/MemorySupport.h"
+
 #include <set>
 #include <tuple>
 
@@ -37,6 +52,7 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
+NEConvolutionLayerReshapeWeights::~NEConvolutionLayerReshapeWeights() = default;
 NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights()
     : _weights_reshape_kernel()
 {
@@ -52,7 +68,8 @@
     const bool     append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
     const ITensor *biases_to_use = (append_biases) ? biases : nullptr;
 
-    _weights_reshape_kernel.configure(weights, biases_to_use, output);
+    _weights_reshape_kernel = arm_compute::support::cpp14::make_unique<NEWeightsReshapeKernel>();
+    _weights_reshape_kernel->configure(weights, biases_to_use, output);
 
     output->info()->set_quantization_info(weights->info()->quantization_info());
 }
@@ -86,9 +103,11 @@
 
 void NEConvolutionLayerReshapeWeights::run()
 {
-    NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+    NEScheduler::get().schedule(_weights_reshape_kernel.get(), 3);
 }
 
+NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default;
+
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager)
     : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager),
       _col2im_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _skip_im2col(false),
@@ -323,7 +342,8 @@
         _memory_group.manage(&_im2col_output);
 
         // Configure
-        _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
+        _im2col_kernel = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
+        _im2col_kernel->configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
 
         // Update GEMM input
         gemm_input_to_use = &_im2col_output;
@@ -365,7 +385,8 @@
         if(_data_layout == DataLayout::NCHW)
         {
             // Configure col2im
-            _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h));
+            _col2im_kernel = arm_compute::support::cpp14::make_unique<NECol2ImKernel>();
+            _col2im_kernel->configure(gemm_output_to_use, output, Size2D(conv_w, conv_h));
         }
         else
         {
@@ -538,7 +559,7 @@
     {
         // Run input reshaping
         unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-        NEScheduler::get().schedule(&_im2col_kernel, y_dim);
+        NEScheduler::get().schedule(_im2col_kernel.get(), y_dim);
     }
 
     // Runs NEGEMM or NEGEMMLowpMatrixMultiplyCore functions
@@ -558,7 +579,7 @@
     {
         if(_data_layout == DataLayout::NCHW)
         {
-            NEScheduler::get().schedule(&_col2im_kernel, Window::DimY);
+            NEScheduler::get().schedule(_col2im_kernel.get(), Window::DimY);
         }
         else
         {
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
index ad306c3..70fdcf4 100644
--- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
 
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 6d52f2b..09637dd 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -26,17 +26,19 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEGEMMLowpAssemblyMatrixMultiplyCore::~NEGEMMLowpAssemblyMatrixMultiplyCore() = default;
 
 NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b()
@@ -137,3 +139,4 @@
         NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
     }
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 36357dd..9050427 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -34,12 +34,23 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "src/core/helpers/AutoConfiguration.h"
+
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
 using namespace arm_compute::misc::shape_calculator;
 
+NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
+
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(memory_manager, weights_manager), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(),
       _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(),
@@ -80,7 +91,8 @@
 
         _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
         _memory_group.manage(&_signed_a);
-        _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
+        _convert_to_signed_asymm = arm_compute::support::cpp14::make_unique<NEConvertQuantizedSignednessKernel>();
+        _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
         a_to_use  = &_signed_a;
         _a_offset = _signed_a.info()->quantization_info().uniform().offset;
 
@@ -153,10 +165,12 @@
         }
 
         // Configure interleave kernel
-        _mtx_a_reshape_kernel.configure(a_to_use, &_tmp_a);
+        _mtx_a_reshape_kernel = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+        _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
 
         // Configure transpose kernel
-        _mtx_b_reshape_kernel.configure(b, &_tmp_b);
+        _mtx_b_reshape_kernel = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+        _mtx_b_reshape_kernel->configure(b, &_tmp_b);
     }
 
     if(!_fused_assembly_path)
@@ -176,7 +190,8 @@
             }
 
             // Configure Matrix B reduction kernel
-            _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, reduction_info);
+            _mtx_b_reduction_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixBReductionKernel>();
+            _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
         }
 
         // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -188,7 +203,8 @@
             _memory_group.manage(&_vector_sum_row);
 
             // Configure matrix A reduction kernel
-            _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, reduction_info);
+            _mtx_a_reduction_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+            _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
         }
 
         if(_fuse_output_stage)
@@ -196,19 +212,22 @@
             // Configure matrix multiply kernel
             if(!_assembly_path)
             {
-                _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32);
+                _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+                _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
             }
 
-            _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
-                                                               _a_offset == 0 ? nullptr : &_vector_sum_col,
-                                                               _b_offset == 0 ? nullptr : &_vector_sum_row, c,
-                                                               _flip_signedness ? &_signed_output : output,
-                                                               a->info()->dimension(0),
-                                                               _a_offset, _b_offset, info.gemmlowp_output_stage());
+            _offset_contribution_output_stage_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpOffsetContributionOutputStageKernel>();
+            _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
+                                                                _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                                                _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+                                                                _flip_signedness ? &_signed_output : output,
+                                                                a->info()->dimension(0),
+                                                                _a_offset, _b_offset, info.gemmlowp_output_stage());
 
             if(_flip_signedness)
             {
-                _convert_from_signed_asymm.configure(&_signed_output, output);
+                _convert_from_signed_asymm = arm_compute::support::cpp14::make_unique<NEConvertQuantizedSignednessKernel>();
+                _convert_from_signed_asymm->configure(&_signed_output, output);
             }
         }
         else
@@ -216,10 +235,12 @@
             // Configure matrix multiply kernel
             if(!_assembly_path)
             {
-                _mm_kernel.configure(matrix_a, matrix_b, output);
+                _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+                _mm_kernel->configure(matrix_a, matrix_b, output);
             }
             // Configure offset contribution kernel
-            _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
+            _offset_contribution_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpOffsetContributionKernel>();
+            _offset_contribution_kernel->configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
         }
 
         // Configure activation
@@ -468,7 +489,7 @@
     // Convert QASYMM8->QASYMM8_SIGNED
     if(_flip_signedness)
     {
-        NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
+        NEScheduler::get().schedule(_convert_to_signed_asymm.get(), Window::DimY);
     }
 
     // Run GEMM
@@ -481,15 +502,15 @@
         if(!_run_vector_matrix_multiplication)
         {
             // Run interleave kernel
-            NEScheduler::get().schedule(&_mtx_a_reshape_kernel, Window::DimY);
+            NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
 
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
+                NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
             }
         }
-        NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+        NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
     }
 
     if(!_fused_assembly_path)
@@ -497,31 +518,31 @@
         // Run matrix A reduction kernel only if _b_offset is not equal to 0
         if(_b_offset != 0)
         {
-            NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+            NEScheduler::get().schedule(_mtx_a_reduction_kernel.get(), Window::DimX);
         }
 
         // Run matrix B reduction kernel only if _a_offset is not equal to 0
         if(_a_offset != 0 && !_reshape_b_only_on_first_run)
         {
-            NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+            NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
         }
 
         if(_fuse_output_stage)
         {
             // Run offset contribution kernel
-            NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+            NEScheduler::get().schedule(_offset_contribution_output_stage_kernel.get(), Window::DimY);
         }
         else
         {
             // Run offset contribution kernel
-            NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+            NEScheduler::get().schedule(_offset_contribution_kernel.get(), Window::DimY);
         }
     }
 
     // Convert QASYMM8_SIGNED->QASYMM8
-    if(_flip_signedness)
+    if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
     {
-        NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
+        NEScheduler::get().schedule(_convert_from_signed_asymm.get(), Window::DimY);
     }
 
     // Run fused activation unless already run in the fused assembly
@@ -560,7 +581,7 @@
 
             // Run reshape kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
-            NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
+            NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
             if(!original_b_managed_by_weights_manager)
             {
                 _original_b->mark_as_unused();
@@ -571,7 +592,7 @@
         if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
         {
             _vector_sum_col.allocator()->allocate();
-            NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+            NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
         }
 
         _is_prepared = true;
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 239a8e6..9fb8851 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -24,15 +24,17 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint() = default;
+
 void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
                                                                     int result_offset_after_shift, int min, int max)
 {
@@ -46,6 +48,8 @@
     return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
 }
 
+NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint() = default;
+
 void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
                                                                    int result_offset_after_shift, int min, int max)
 {
@@ -59,6 +63,8 @@
     return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
 }
 
+NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint() = default;
+
 void NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min, int max)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
@@ -71,6 +77,8 @@
     return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max);
 }
 
+NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default;
+
 void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info)
 {
     // Perform validate step
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
index e807e86..90cf0ba 100644
--- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -25,9 +25,9 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp
index 5238936..5c0dae1 100644
--- a/src/runtime/NEON/functions/NEGather.cpp
+++ b/src/runtime/NEON/functions/NEGather.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGather.h"
 
-#include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
+#include "src/core/NEON/kernels/NEGatherKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEGaussian3x3.cpp b/src/runtime/NEON/functions/NEGaussian3x3.cpp
index fba49ed..5290de1 100644
--- a/src/runtime/NEON/functions/NEGaussian3x3.cpp
+++ b/src/runtime/NEON/functions/NEGaussian3x3.cpp
@@ -23,18 +23,23 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
 
-#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEGaussian3x3Kernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEGaussian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEGaussian3x3Kernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index 99591f4..7857710 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,13 +24,17 @@
 #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEGaussian5x5::~NEGaussian5x5() = default;
 
 NEGaussian5x5::NEGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
@@ -46,21 +50,26 @@
     // Manage intermediate buffers
     _memory_group.manage(&_tmp);
 
+    _kernel_hor     = arm_compute::support::cpp14::make_unique<NEGaussian5x5HorKernel>();
+    _kernel_vert    = arm_compute::support::cpp14::make_unique<NEGaussian5x5VertKernel>();
+    _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+
     // Create and configure kernels for the two passes
-    _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
-    _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
+    _kernel_hor->configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
+    _kernel_vert->configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
 
     _tmp.allocator()->allocate();
 
-    _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler->configure(input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value));
 }
 
 void NEGaussian5x5::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
-    NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+    NEScheduler::get().schedule(_kernel_hor.get(), Window::DimY);
+    NEScheduler::get().schedule(_kernel_vert.get(), Window::DimY);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index e4e20e0..30fe70f 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -25,16 +25,18 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
-#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
 #include "arm_compute/runtime/Pyramid.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "src/core/NEON/kernels/NEGaussianPyramidKernel.h"
+#include "src/core/NEON/kernels/NEScaleKernel.h"
+#include "support/MemorySupport.h"
 
 #include <cstddef>
 
@@ -45,6 +47,8 @@
 {
 }
 
+NEGaussianPyramidHalf::~NEGaussianPyramidHalf() = default;
+
 NEGaussianPyramidHalf::NEGaussianPyramidHalf() // NOLINT
     : _horizontal_border_handler(),
       _vertical_border_handler(),
@@ -94,16 +98,20 @@
         for(size_t i = 0; i < num_stages; ++i)
         {
             /* Configure horizontal kernel */
-            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
+            _horizontal_reduction[i] = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel>();
+            _horizontal_reduction[i]->configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
 
             /* Configure vertical kernel */
-            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
+            _vertical_reduction[i] = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel>();
+            _vertical_reduction[i]->configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
 
             /* Configure border */
-            _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+            _horizontal_border_handler[i] = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+            _horizontal_border_handler[i]->configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i]->border_size(), border_mode, PixelValue(constant_border_value));
 
             /* Configure border */
-            _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
+            _vertical_border_handler[i] = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+            _vertical_border_handler[i]->configure(_tmp.get_pyramid_level(i), _vertical_reduction[i]->border_size(), border_mode, PixelValue(pixel_value_u16));
         }
 
         _tmp.allocate();
@@ -122,13 +130,15 @@
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        NEScheduler::get().schedule(&_horizontal_border_handler[i], Window::DimZ);
-        NEScheduler::get().schedule(&_horizontal_reduction[i], Window::DimY);
-        NEScheduler::get().schedule(&_vertical_border_handler[i], Window::DimZ);
-        NEScheduler::get().schedule(&_vertical_reduction[i], Window::DimY);
+        NEScheduler::get().schedule(_horizontal_border_handler[i].get(), Window::DimZ);
+        NEScheduler::get().schedule(_horizontal_reduction[i].get(), Window::DimY);
+        NEScheduler::get().schedule(_vertical_border_handler[i].get(), Window::DimZ);
+        NEScheduler::get().schedule(_vertical_reduction[i].get(), Window::DimY);
     }
 }
 
+NEGaussianPyramidOrb::~NEGaussianPyramidOrb() = default;
+
 NEGaussianPyramidOrb::NEGaussianPyramidOrb() // NOLINT
     : _gaus5x5(),
       _scale_nearest()
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
index 13210a0..d9a498e 100644
--- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -25,19 +25,22 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
 NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager),
-      _permute_deltas_kernel(),
+      _permute_deltas(),
       _flatten_deltas(),
-      _permute_scores_kernel(),
+      _permute_scores(),
       _flatten_scores(),
-      _compute_anchors_kernel(),
-      _bounding_box_kernel(),
-      _pad_kernel(),
+      _compute_anchors(),
+      _bounding_box(),
+      _pad(),
       _dequantize_anchors(),
       _dequantize_deltas(),
       _quantize_all_proposals(),
@@ -62,6 +65,8 @@
 {
 }
 
+NEGenerateProposalsLayer::~NEGenerateProposalsLayer() = default;
+
 void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals,
                                          const GenerateProposalsInfo &info)
 {
@@ -85,7 +90,7 @@
 
     // Compute all the anchors
     _memory_group.manage(&_all_anchors);
-    _compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+    _compute_anchors.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
 
     const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
     _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
@@ -95,7 +100,7 @@
     if(!_is_nhwc)
     {
         _memory_group.manage(&_deltas_permuted);
-        _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+        _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
         _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened);
         _deltas_permuted.allocator()->allocate();
     }
@@ -112,7 +117,7 @@
     if(!_is_nhwc)
     {
         _memory_group.manage(&_scores_permuted);
-        _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+        _permute_scores.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
         _flatten_scores.configure(&_scores_permuted, &_scores_flattened);
         _scores_permuted.allocator()->allocate();
     }
@@ -141,7 +146,7 @@
     // Bounding box transform
     _memory_group.manage(&_all_proposals);
     BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f);
-    _bounding_box_kernel.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
+    _bounding_box.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
     deltas_to_use->allocator()->allocate();
     anchors_to_use->allocator()->allocate();
 
@@ -197,7 +202,7 @@
     _scores_flattened.allocator()->allocate();
 
     // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
-    _pad_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+    _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
     _proposals_4_roi_values.allocator()->allocate();
 }
 
@@ -229,7 +234,7 @@
     }
 
     TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchors::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
 
     TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
     TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
@@ -240,8 +245,8 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
     }
 
     TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
@@ -258,25 +263,25 @@
     if(is_qasymm8)
     {
         TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&all_anchors_info, &all_anchors_f32_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info));
 
         TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
 
         TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
-                                                                           BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+                                                                     BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
         proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
-                                                                           BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+                                                                     BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
 
     if(num_valid_proposals->total_size() > 0)
     {
@@ -319,13 +324,13 @@
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Compute all the anchors
-    NEScheduler::get().schedule(&_compute_anchors_kernel, Window::DimY);
+    _compute_anchors.run();
 
     // Transpose and reshape the inputs
     if(!_is_nhwc)
     {
-        NEScheduler::get().schedule(&_permute_deltas_kernel, Window::DimY);
-        NEScheduler::get().schedule(&_permute_scores_kernel, Window::DimY);
+        _permute_deltas.run();
+        _permute_scores.run();
     }
 
     _flatten_deltas.run();
@@ -333,22 +338,22 @@
 
     if(_is_qasymm8)
     {
-        NEScheduler::get().schedule(&_dequantize_anchors, Window::DimY);
-        NEScheduler::get().schedule(&_dequantize_deltas, Window::DimY);
+        _dequantize_anchors.run();
+        _dequantize_deltas.run();
     }
 
     // Build the boxes
-    NEScheduler::get().schedule(&_bounding_box_kernel, Window::DimY);
+    _bounding_box.run();
 
     if(_is_qasymm8)
     {
-        NEScheduler::get().schedule(&_quantize_all_proposals, Window::DimY);
+        _quantize_all_proposals.run();
     }
 
     // Non maxima suppression
     _cpp_nms.run();
 
     // Add dummy batch indexes
-    NEScheduler::get().schedule(&_pad_kernel, Window::DimY);
+    _pad.run();
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index 10765f9..689e64f 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,14 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEDerivativeKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHOGDescriptor::~NEHOGDescriptor() = default;
 
 NEHOGDescriptor::NEHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
@@ -82,10 +88,12 @@
     _memory_group.manage(&_hog_space);
 
     // Initialise orientation binning kernel
-    _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
+    _orient_bin = arm_compute::support::cpp14::make_unique<NEHOGOrientationBinningKernel>();
+    _orient_bin->configure(&_mag, &_phase, &_hog_space, hog->info());
 
     // Initialize HOG norm kernel
-    _block_norm.configure(&_hog_space, output, hog->info());
+    _block_norm = arm_compute::support::cpp14::make_unique<NEHOGBlockNormalizationKernel>();
+    _block_norm->configure(&_hog_space, output, hog->info());
 
     // Allocate intermediate tensors
     _mag.allocator()->allocate();
@@ -101,8 +109,9 @@
     _gradient.run();
 
     // Run orientation binning kernel
-    NEScheduler::get().schedule(&_orient_bin, Window::DimY);
+    NEScheduler::get().schedule(_orient_bin.get(), Window::DimY);
 
     // Run block normalization kernel
-    NEScheduler::get().schedule(&_block_norm, Window::DimY);
+    NEScheduler::get().schedule(_block_norm.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
index 21db5f8..8468b75 100644
--- a/src/runtime/NEON/functions/NEHOGDetector.cpp
+++ b/src/runtime/NEON/functions/NEHOGDetector.cpp
@@ -23,10 +23,12 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
 
-#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+#include "src/core/NEON/kernels/NEHOGDetectorKernel.h"
 #include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHOGDetector::~NEHOGDetector() = default;
 
 void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
 {
@@ -34,3 +36,4 @@
     k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
     _kernel = std::move(k);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index 8f3559a..7d794bc 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -23,12 +23,16 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
 
-#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEDerivativeKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHOGGradient::~NEHOGGradient() = default;
 
 NEHOGGradient::NEHOGGradient(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
@@ -88,3 +92,4 @@
     // Run magnitude/phase kernel
     NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index e08b699..3e41faa 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -28,8 +28,13 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NEDerivativeKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHOGMultiDetection::~NEHOGMultiDetection() = default;
 
 NEHOGMultiDetection::NEHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
@@ -262,3 +267,4 @@
         NEScheduler::get().schedule(&_non_maxima_kernel, Window::DimY);
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index 3c51eb2..23fcf8c 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -24,8 +24,6 @@
 #include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Array.h"
@@ -34,12 +32,19 @@
 #include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
 #include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "src/core/NEON/kernels/NESobel5x5Kernel.h"
+#include "src/core/NEON/kernels/NESobel7x7Kernel.h"
 #include "support/MemorySupport.h"
 
 #include <cmath>
 #include <utility>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHarrisCorners::~NEHarrisCorners() = default;
 
 NEHarrisCorners::NEHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
@@ -154,8 +159,10 @@
     }
 
     // Configure border filling before harris score
-    _border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
-    _border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
+    _border_gx = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    _border_gy = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    _border_gx->configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
+    _border_gy->configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
 
     // Allocate once all the configure methods have been called
     _gx.allocator()->allocate();
@@ -193,8 +200,8 @@
     _sobel->run();
 
     // Fill border before harris score kernel
-    NEScheduler::get().schedule(&_border_gx, Window::DimZ);
-    NEScheduler::get().schedule(&_border_gy, Window::DimZ);
+    NEScheduler::get().schedule(_border_gx.get(), Window::DimZ);
+    NEScheduler::get().schedule(_border_gy.get(), Window::DimZ);
 
     // Run harris score kernel
     NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
@@ -208,3 +215,4 @@
     // Run sort & euclidean distance
     NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index 39fad97..40ea3a1 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -29,8 +29,12 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEHistogramKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHistogram::~NEHistogram() = default;
 
 NEHistogram::NEHistogram()
     : _histogram_kernel(), _local_hist(), _window_lut(window_lut_default_size), _local_hist_size(0)
@@ -47,11 +51,13 @@
     _local_hist.resize(_local_hist_size);
 
     // Configure kernel
-    _histogram_kernel.configure(input, output, _local_hist.data(), _window_lut.data());
+    _histogram_kernel = arm_compute::support::cpp14::make_unique<NEHistogramKernel>();
+    _histogram_kernel->configure(input, output, _local_hist.data(), _window_lut.data());
 }
 
 void NEHistogram::run()
 {
     // Calculate histogram of input.
-    NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
+    NEScheduler::get().schedule(_histogram_kernel.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index 99e5d3f..bc0c601 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -25,9 +25,13 @@
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEIm2ColKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEIm2Col::~NEIm2Col() = default;
+
 NEIm2Col::NEIm2Col()
     : _kernel(), _y_dim(1)
 {
@@ -37,7 +41,8 @@
 {
     _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
-    _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
+    _kernel = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
+    _kernel->configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
 }
 
 Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
@@ -48,6 +53,6 @@
 
 void NEIm2Col::run()
 {
-    NEScheduler::get().schedule(&_kernel, _y_dim);
+    NEScheduler::get().schedule(_kernel.get(), _y_dim);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
index 57d01ff..e3fb284 100644
--- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
@@ -26,9 +26,13 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default;
+
 NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
 {
@@ -42,6 +46,8 @@
     // Configure Kernels
     _is_nchw = data_layout == DataLayout::NCHW;
 
+    _normalization_kernel = arm_compute::support::cpp14::make_unique<NEInstanceNormalizationLayerKernel>();
+
     if(!_is_nchw)
     {
         _memory_group.manage(&_permuted_input);
@@ -51,7 +57,7 @@
         _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
         _permuted_input.info()->set_data_layout(DataLayout::NCHW);
 
-        _normalization_kernel.configure(&_permuted_input, &_permuted_output, kernel_descriptor);
+        _normalization_kernel->configure(&_permuted_input, &_permuted_output, kernel_descriptor);
         _permuted_output.info()->set_data_layout(DataLayout::NCHW);
 
         _permute_output.configure(&_permuted_output, output != nullptr ? output : input, PermutationVector(2U, 0U, 1U));
@@ -60,7 +66,7 @@
     }
     else
     {
-        _normalization_kernel.configure(input, output, kernel_descriptor);
+        _normalization_kernel->configure(input, output, kernel_descriptor);
     }
 }
 
@@ -81,7 +87,7 @@
         _permute_input.run();
     }
 
-    NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ);
+    NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ);
 
     // Permute output
     if(!_is_nchw)
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index 8ab6bbd..63bcd53 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -23,18 +23,25 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
 
-#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEIntegralImageKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEIntegralImage::~NEIntegralImage() = default;
 
 void NEIntegralImage::configure(const ITensor *input, ITensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
-    _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index 04cf3a2..4a99968 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,9 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
@@ -32,6 +35,7 @@
 {
 constexpr int max_input_tensor_dim = 3;
 } // namespace
+NEL2NormalizeLayer::~NEL2NormalizeLayer() = default;
 
 NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
@@ -46,7 +50,8 @@
     // Configure Kernels
     const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
     _reduce_func.configure(input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE);
-    _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+    _normalize_kernel = arm_compute::support::cpp14::make_unique<NEL2NormalizeLayerKernel>();
+    _normalize_kernel->configure(input, &_sumsq, output, axis, epsilon);
 
     // Allocate intermediate tensors
     _sumsq.allocator()->allocate();
@@ -78,6 +83,6 @@
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     _reduce_func.run();
-    NEScheduler::get().schedule(&_normalize_kernel, Window::DimY);
+    NEScheduler::get().schedule(_normalize_kernel.get(), Window::DimY);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index dca274a..48d69bd 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -29,12 +29,24 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/common/LSTMParams.h"
+#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 
 namespace arm_compute
 {
 using namespace arm_compute::misc::shape_calculator;
 using namespace arm_compute::utils::info_helpers;
 
+NELSTMLayer::~NELSTMLayer() = default;
+
 NELSTMLayer::NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
       _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(),
@@ -575,8 +587,8 @@
     }
 
     // Validate copy kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(&cell_state_tmp, cell_state_out));
-    ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(output_state_out, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(&cell_state_tmp, cell_state_out));
+    ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(output_state_out, output));
 
     // Validate scratch concatenation
     std::vector<const ITensorInfo *> inputs_vector_info_raw;
@@ -646,7 +658,7 @@
     }
 
     _fully_connected_cell_state.run();
-    NEScheduler::get().schedule(&_transpose_cell_state, Window::DimY);
+    _transpose_cell_state.run();
     _gemm_cell_state1.run();
     _accum_cell_state1.run();
     if(_is_layer_norm_lstm)
@@ -691,8 +703,8 @@
         }
     }
 
-    NEScheduler::get().schedule(&_copy_cell_state, Window::DimY);
-    NEScheduler::get().schedule(&_copy_output, Window::DimY);
+    _copy_cell_state.run();
+    _copy_output.run();
 
     _concat_scratch_buffer.run();
 }
diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
index 7610d15..e439293 100644
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
@@ -26,6 +26,16 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
 #include <cmath>
@@ -42,6 +52,7 @@
 const QuantizationInfo qsymm_4(16.f / 32768.f, 0); // qsymm16 with 4 integer bit
 const QuantizationInfo qsymm_0(1.f / 32768.f, 0);  // qsymm16 with 0 integer bit
 } // namespace
+NELSTMLayerQuantized::~NELSTMLayerQuantized() = default;
 
 NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index 4f0639b..a2651db 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -29,11 +29,15 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
 #include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
 #include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "src/core/NEON/kernels/NEGaussianPyramidKernel.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NELaplacianPyramid::~NELaplacianPyramid() = default;
 
 NELaplacianPyramid::NELaplacianPyramid() // NOLINT
     : _num_levels(0),
@@ -105,3 +109,4 @@
     _gauss_pyr.allocate();
     _conv_pyr.allocate();
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index aa5f8a2..a50e7cc 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
 
+#include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/IPyramid.h"
 #include "arm_compute/core/ITensor.h"
@@ -31,7 +32,9 @@
 
 #include <cstddef>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NELaplacianReconstruct::~NELaplacianReconstruct() = default;
 
 NELaplacianReconstruct::NELaplacianReconstruct() // NOLINT
     : _tmp_pyr(),
@@ -100,3 +103,4 @@
 
     _depthf.run();
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index af502be..131ac82 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,12 +27,16 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEIm2ColKernel.h"
+#include "src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "support/MemorySupport.h"
 
 #include <cmath>
 #include <tuple>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -70,9 +74,10 @@
     shape_gemm.set(1, mat_input_rows);
 }
 } // namespace
+NELocallyConnectedLayer::~NELocallyConnectedLayer() = default;
 
 NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+    : _memory_group(std::move(memory_manager)), _input_im2col(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
       _is_prepared(false), _original_weights(nullptr)
 {
 }
@@ -113,10 +118,10 @@
     TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
     TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2Col::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
     ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
     ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+    ARM_COMPUTE_RETURN_ON_ERROR(NECol2Im::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
 
     return Status{};
 }
@@ -154,10 +159,12 @@
     _memory_group.manage(&_gemm_output);
 
     // Configure kernels
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
-    _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
-    _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
-    _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
+    _input_im2col.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
+    _weights_reshape_kernel = arm_compute::support::cpp14::make_unique<NEWeightsReshapeKernel>();
+    _weights_reshape_kernel->configure(weights, biases, &_weights_reshaped);
+    _mm_kernel = arm_compute::support::cpp14::make_unique<NELocallyConnectedMatrixMultiplyKernel>();
+    _mm_kernel->configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+    _output_col2im.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
 
     // Allocate intermediate tensors
     _input_im2col_reshaped.allocator()->allocate();
@@ -171,13 +178,13 @@
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run input reshaping
-    NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+    _input_im2col.run();
 
     // Runs GEMM on reshaped matrices
-    NEScheduler::get().schedule(&_mm_kernel, Window::DimX);
+    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimX);
 
     // Reshape output matrix
-    NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+    _output_col2im.run();
 }
 
 void NELocallyConnectedLayer::prepare()
@@ -188,9 +195,10 @@
 
         // Run weights reshaping and mark original weights tensor as unused
         _weights_reshaped.allocator()->allocate();
-        NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+        NEScheduler::get().schedule(_weights_reshape_kernel.get(), 3);
         _original_weights->mark_as_unused();
 
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
index 5ca672e..06ed8d4 100644
--- a/src/runtime/NEON/functions/NEMagnitude.cpp
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -23,13 +23,15 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
 
-#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEMagnitude::~NEMagnitude() = default;
 
 void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type)
 {
@@ -46,3 +48,4 @@
         _kernel = std::move(k);
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
index 9d3f34f..e8c9d09 100644
--- a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
@@ -25,9 +25,14 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
+#include "src/core/NEON/kernels/NEMemsetKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default;
+
 NEMaxUnpoolingLayer::NEMaxUnpoolingLayer()
 
     : _memset_kernel(), _unpooling_layer_kernel()
@@ -37,8 +42,10 @@
 void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info)
 {
     const PixelValue zero_value(0.f);
-    _memset_kernel.configure(output, zero_value);
-    _unpooling_layer_kernel.configure(input, indices, output, pool_info);
+    _memset_kernel          = arm_compute::support::cpp14::make_unique<NEMemsetKernel>();
+    _unpooling_layer_kernel = arm_compute::support::cpp14::make_unique<NEMaxUnpoolingLayerKernel>();
+    _memset_kernel->configure(output, zero_value);
+    _unpooling_layer_kernel->configure(input, indices, output, pool_info);
 }
 
 Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
@@ -48,7 +55,7 @@
 
 void NEMaxUnpoolingLayer::run()
 {
-    NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_unpooling_layer_kernel, Window::DimY);
+    NEScheduler::get().schedule(_memset_kernel.get(), Window::DimY);
+    NEScheduler::get().schedule(_unpooling_layer_kernel.get(), Window::DimY);
 }
 } /* namespace arm_compute */
diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp
index 57363f0..e073420 100644
--- a/src/runtime/NEON/functions/NEMeanStdDev.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,8 +24,13 @@
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEMeanStdDevKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEMeanStdDev::~NEMeanStdDev() = default;
 
 NEMeanStdDev::NEMeanStdDev()
     : _mean_stddev_kernel(), _fill_border_kernel(), _global_sum(0), _global_sum_squared(0)
@@ -34,8 +39,11 @@
 
 void NEMeanStdDev::configure(IImage *input, float *mean, float *stddev)
 {
-    _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
-    _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
+    _mean_stddev_kernel = arm_compute::support::cpp14::make_unique<NEMeanStdDevKernel>();
+    _fill_border_kernel = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+
+    _mean_stddev_kernel->configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+    _fill_border_kernel->configure(input, _mean_stddev_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
 }
 
 void NEMeanStdDev::run()
@@ -43,6 +51,7 @@
     _global_sum         = 0;
     _global_sum_squared = 0;
 
-    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimZ);
-    NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY);
+    NEScheduler::get().schedule(_fill_border_kernel.get(), Window::DimZ);
+    NEScheduler::get().schedule(_mean_stddev_kernel.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
index a88732b..d128c44 100644
--- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
@@ -23,11 +23,13 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
+#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEMeanStdDevNormalizationLayer::~NEMeanStdDevNormalizationLayer() = default;
+
 void NEMeanStdDevNormalizationLayer::configure(ITensor *input, ITensor *output, float epsilon)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEMeanStdDevNormalizationKernel>();
diff --git a/src/runtime/NEON/functions/NEMedian3x3.cpp b/src/runtime/NEON/functions/NEMedian3x3.cpp
index 2bbe8d3..b7b7c2c 100644
--- a/src/runtime/NEON/functions/NEMedian3x3.cpp
+++ b/src/runtime/NEON/functions/NEMedian3x3.cpp
@@ -23,18 +23,23 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
 
-#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEMedian3x3Kernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEMedian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEMedian3x3Kernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
index ca63937..3c2219c 100644
--- a/src/runtime/NEON/functions/NEMinMaxLocation.cpp
+++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,8 +24,12 @@
 #include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEMinMaxLocationKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEMinMaxLocation::~NEMinMaxLocation() = default;
 
 NEMinMaxLocation::NEMinMaxLocation()
     : _min_max(), _min_max_loc()
@@ -34,17 +38,21 @@
 
 void NEMinMaxLocation::configure(const IImage *input, void *min, void *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
 {
-    _min_max.configure(input, min, max);
-    _min_max_loc.configure(input, min, max, min_loc, max_loc, min_count, max_count);
+    _min_max = arm_compute::support::cpp14::make_unique<NEMinMaxKernel>();
+    _min_max->configure(input, min, max);
+
+    _min_max_loc = arm_compute::support::cpp14::make_unique<NEMinMaxLocationKernel>();
+    _min_max_loc->configure(input, min, max, min_loc, max_loc, min_count, max_count);
 }
 
 void NEMinMaxLocation::run()
 {
-    _min_max.reset();
+    _min_max->reset();
 
     /* Run min max kernel */
-    NEScheduler::get().schedule(&_min_max, Window::DimY);
+    NEScheduler::get().schedule(_min_max.get(), Window::DimY);
 
     /* Run min max location */
-    NEScheduler::get().schedule(&_min_max_loc, Window::DimY);
+    NEScheduler::get().schedule(_min_max_loc.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NENonLinearFilter.cpp b/src/runtime/NEON/functions/NENonLinearFilter.cpp
index b7c72ac..4d8fd00 100644
--- a/src/runtime/NEON/functions/NENonLinearFilter.cpp
+++ b/src/runtime/NEON/functions/NENonLinearFilter.cpp
@@ -23,14 +23,15 @@
  */
 #include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h"
 
-#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NENonLinearFilterKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NENonLinearFilter::configure(ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
                                   BorderMode border_mode,
                                   uint8_t    constant_border_value)
@@ -38,5 +39,9 @@
     auto k = arm_compute::support::cpp14::make_unique<NENonLinearFilterKernel>();
     k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
index 4d9edf7..b8f5c25 100644
--- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
+++ b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
@@ -23,25 +23,29 @@
  */
 #include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
 
-#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode)
 {
     auto k = arm_compute::support::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
 
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
     if(border_mode != BorderMode::UNDEFINED)
     {
-        _border_handler.configure(input, BorderSize(1), BorderMode::CONSTANT, static_cast<float>(0.f));
+        b->configure(input, BorderSize(1), BorderMode::CONSTANT, static_cast<float>(0.f));
     }
     else
     {
-        _border_handler.configure(input, BorderSize(1), BorderMode::UNDEFINED, static_cast<float>(0.f));
+        b->configure(input, BorderSize(1), BorderMode::UNDEFINED, static_cast<float>(0.f));
     }
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index 10ee938..dfc73b2 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -29,9 +29,13 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NENormalizationLayer::~NENormalizationLayer() = default;
+
 NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_f(), _input_squared()
 {
@@ -48,7 +52,8 @@
     _memory_group.manage(&_input_squared);
 
     // Configure kernels
-    _norm_kernel.configure(input, &_input_squared, output, norm_info);
+    _norm_kernel = arm_compute::support::cpp14::make_unique<NENormalizationLayerKernel>();
+    _norm_kernel->configure(input, &_input_squared, output, norm_info);
     _multiply_f.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
 
     // Allocate the tensor once the configure methods have been called
@@ -70,6 +75,6 @@
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
     _multiply_f.run();
-    NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+    NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY);
 }
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index c9e0748..565346b 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -25,7 +25,6 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
@@ -33,8 +32,13 @@
 #include "arm_compute/runtime/Pyramid.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NELKTrackerKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEOpticalFlow::~NEOpticalFlow() = default;
 
 NEOpticalFlow::NEOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
@@ -110,11 +114,12 @@
         _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
 
         // Init Lucas-Kanade kernel
-        _kernel_tracker[i].configure(old_ith_input, new_ith_input, &_scharr_gx[i], &_scharr_gy[i],
-                                     old_points, new_points_estimates, new_points,
-                                     &_old_points_internal, &_new_points_internal,
-                                     termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
-                                     i, _num_levels, pyr_scale);
+        _kernel_tracker[i] = arm_compute::support::cpp14::make_unique<NELKTrackerKernel>();
+        _kernel_tracker[i]->configure(old_ith_input, new_ith_input, &_scharr_gx[i], &_scharr_gy[i],
+                                      old_points, new_points_estimates, new_points,
+                                      &_old_points_internal, &_new_points_internal,
+                                      termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
+                                      i, _num_levels, pyr_scale);
 
         _scharr_gx[i].allocator()->allocate();
         _scharr_gy[i].allocator()->allocate();
@@ -133,6 +138,7 @@
         _func_scharr[level - 1].run();
 
         // Run Lucas-Kanade kernel
-        NEScheduler::get().schedule(&_kernel_tracker[level - 1], Window::DimX);
+        NEScheduler::get().schedule(_kernel_tracker[level - 1].get(), Window::DimX);
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp
index f9393a4..00a1a42 100644
--- a/src/runtime/NEON/functions/NEPReluLayer.cpp
+++ b/src/runtime/NEON/functions/NEPReluLayer.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
+#include "src/core/NEON/kernels/NEElementwiseOperationKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index 03c597a..92659f3 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -27,7 +27,10 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
@@ -47,6 +50,8 @@
 }
 } // namespace
 
+NEPadLayer::~NEPadLayer() = default;
+
 NEPadLayer::NEPadLayer()
     : _copy_kernel(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
 {
@@ -54,7 +59,8 @@
 
 void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
 {
-    _pad_kernel.configure(input, output, padding, constant_value, PaddingMode::CONSTANT);
+    _pad_kernel = arm_compute::support::cpp14::make_unique<NEPadLayerKernel>();
+    _pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT);
 }
 
 void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *output)
@@ -195,7 +201,8 @@
     else
     {
         // Copy the input to the whole output if no padding is applied
-        _copy_kernel.configure(input, output);
+        _copy_kernel = arm_compute::support::cpp14::make_unique<NECopyKernel>();
+        _copy_kernel->configure(input, output);
     }
 }
 
@@ -251,7 +258,7 @@
         {
             case PaddingMode::CONSTANT:
             {
-                NEScheduler::get().schedule(&_pad_kernel, Window::DimZ);
+                NEScheduler::get().schedule(_pad_kernel.get(), Window::DimZ);
                 break;
             }
             case PaddingMode::REFLECT:
@@ -280,7 +287,7 @@
     }
     else
     {
-        NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+        NEScheduler::get().schedule(_copy_kernel.get(), Window::DimY);
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp
index 698add8..d2a115f 100644
--- a/src/runtime/NEON/functions/NEPermute.cpp
+++ b/src/runtime/NEON/functions/NEPermute.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
 
-#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h"
+#include "src/core/NEON/kernels/NEPermuteKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp
index 8577961..3b6182a 100644
--- a/src/runtime/NEON/functions/NEPhase.cpp
+++ b/src/runtime/NEON/functions/NEPhase.cpp
@@ -23,13 +23,13 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPhase.h"
 
-#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output, PhaseType phase_type)
 {
     if(phase_type == PhaseType::UNSIGNED)
@@ -45,3 +45,4 @@
         _kernel = std::move(k);
     }
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 4208878..f7f4437 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 81bd00d..12ac8d6 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -25,8 +25,13 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEPoolingLayer::~NEPoolingLayer() = default;
 
 NEPoolingLayer::NEPoolingLayer()
     : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW)
@@ -42,7 +47,8 @@
     _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
 
     // Configure pooling kernel
-    _pooling_layer_kernel.configure(input, output, pool_info, indices);
+    _pooling_layer_kernel = arm_compute::support::cpp14::make_unique<NEPoolingLayerKernel>();
+    _pooling_layer_kernel->configure(input, output, pool_info, indices);
 
     switch(_data_layout)
     {
@@ -55,7 +61,8 @@
             {
                 zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
             }
-            _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
+            _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+            _border_handler->configure(input, _pooling_layer_kernel->border_size(), border_mode, zero_value);
             break;
         }
         case DataLayout::NHWC:
@@ -76,16 +83,18 @@
     {
         case DataLayout::NCHW:
             // Fill border
-            NEScheduler::get().schedule(&_border_handler, Window::DimY);
+            NEScheduler::get().schedule(_border_handler.get(), Window::DimY);
 
             // Run pooling layer
-            NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+            NEScheduler::get().schedule(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY);
             break;
         case DataLayout::NHWC:
             // Run pooling layer
-            NEScheduler::get().schedule(&_pooling_layer_kernel, Window::DimX);
+            NEScheduler::get().schedule(_pooling_layer_kernel.get(), Window::DimX);
             break;
         default:
             ARM_COMPUTE_ERROR("Data layout not supported");
     }
 }
+
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
index bcf6bef..bfa06da 100644
--- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
+++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
 
 #include "support/MemorySupport.h"
 
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index e419624..1013730 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -30,7 +30,16 @@
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
@@ -47,6 +56,31 @@
 }
 } // namespace
 
+Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias)
+{
+    // Output quantization scale will be different, but ignored here
+    // since it will be configured at configure() stage.
+    const TensorInfo out
+    {
+        in
+    };
+    return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
+}
+
+void NEQLSTMLayer::configure_layer_norm(NEQLSTMLayer::LayerNormGate g, const ITensor *in)
+{
+    ARM_COMPUTE_ERROR_ON(!_has_layer_norm);
+
+    Tensor &out = get_layer_norm_output(g);
+    _memory_group.manage(&out);
+    out.allocator()->init(*(in->info()));
+
+    get_layer_norm(g) = arm_compute::support::cpp14::make_unique<NEQLSTMLayerNormalizationKernel>();
+    get_layer_norm(g)->configure(in, &out, get_layer_norm_weight(g), get_layer_norm_bias(g));
+}
+
+NEQLSTMLayer::TensorCopyKernel::~TensorCopyKernel() = default;
+
 Status NEQLSTMLayer::TensorCopyKernel::validate(const ITensorInfo &src, const ITensorInfo &dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(src.tensor_shape().num_dimensions() > max_dimension_supported);
@@ -77,7 +111,21 @@
     input_iter, output_iter);
 }
 
+NEQLSTMLayer::~NEQLSTMLayer() = default;
+
 NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(), _transpose_input_to_forget_weights(), _transpose_input_to_cell_weights(), _transpose_input_to_output_weights(), _transpose_input_to_input_weights(),
+      _transpose_recurrent_to_forget_weights(), _transpose_recurrent_to_cell_weights(), _transpose_recurrent_to_output_weights(), _transpose_recurrent_to_input_weights(), _transpose_projection_weights(),
+      _input_to_input_reduction(), _recurrent_to_input_reduction(), _input_to_forget_reduction(), _recurrent_to_forget_reduction(), _input_to_cell_reduction(), _recurrent_to_cell_reduction(),
+      _input_to_output_reduction(), _recurrent_to_output_reduction(), _projection_reduction(), _projection_bias_add(), _mm_input_to_forget(), _mm_recurrent_to_forget(), _pixelwise_mul_cell_to_forget(),
+      _input_to_forget_outstage(), _recurrent_to_forget_outstage(), _cell_to_forget_outstage(), _accumulate_input_recurrent_forget(), _accumulate_cell_forget(), _forget_gate_sigmoid(), _mm_input_to_cell(),
+      _input_to_cell_outstage(), _mm_recurrent_to_cell(), _recurrent_to_cell_outstage(), _accumulate_input_recurrent_modulation(), _cell_gate_tanh(), _input_gate_sub(), _mm_input_to_input(),
+      _input_to_input_outstage(), _mm_recurrent_to_input(), _recurrent_to_input_outstage(), _accumulate_input_recurrent_input(), _pixelwise_mul_cell_to_input(), _cell_to_input_outstage(),
+      _accumulate_cell_input(), _input_gate_sigmoid(), _pixelwise_mul_forget_cell(), _pixelwise_mul_input_cell(), _add_forget_cell(), _cell_clip(), _mm_input_to_output(), _input_to_output_outstage(),
+      _mm_recurrent_to_output(), _recurrent_to_output_outstage(), _accumulate_input_recurrent_output(), _pixelwise_mul_cell_to_output(), _cell_to_output_outstage(), _accumulate_cell_to_output(),
+      _output_gate_sigmoid(), _hidden_tanh(), _pixelwise_mul_hidden(), _hidden_outstage(), _mm_projection(), _projection_outstage(), _accumulate_projection(), _projection_clip(), _projection_bias_copy(),
+      _projection_output_to_accumulate_copy(), _projection_accumulate_to_output_copy(), _hidden_to_output_copy(), _layer_norms(), _copy_output(), _layer_norm_weights(), _layer_norm_bias(),
+      _layer_norm_output()
 {
     _memory_group = MemoryGroup(std::move(memory_manager));
 }
@@ -178,18 +226,29 @@
         _input_to_input_weights     = lstm_params.input_to_input_weights();
         _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
 
-        _input_to_input_reduction.configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-        _recurrent_to_input_reduction.configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+        _input_to_input_reduction     = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+        _recurrent_to_input_reduction = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+        _input_to_input_reduction->configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+        _recurrent_to_input_reduction->configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
     }
-    _input_to_forget_reduction.configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_forget_reduction.configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_cell_reduction.configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_cell_reduction.configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_output_reduction.configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_output_reduction.configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+
+    _input_to_forget_reduction     = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+    _recurrent_to_forget_reduction = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+    _input_to_cell_reduction       = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+    _recurrent_to_cell_reduction   = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+    _input_to_output_reduction     = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+    _recurrent_to_output_reduction = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+
+    _input_to_forget_reduction->configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_cell_reduction->configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_output_reduction->configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_output_reduction->configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
     if(_has_projection)
     {
-        _projection_reduction.configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+        _projection_reduction = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+        _projection_reduction->configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
         if(_projection_bias != nullptr)
         {
             _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
@@ -878,7 +937,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(output_state_out, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(output_state_out, output));
     return Status{};
 }
 
@@ -906,7 +965,7 @@
 
     if(_has_layer_norm)
     {
-        NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Forget), Window::DimY);
+        NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY);
     }
 
     _forget_gate_sigmoid.run();
@@ -921,7 +980,7 @@
 
     if(_has_layer_norm)
     {
-        NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Cell), Window::DimY);
+        NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY);
     }
 
     _cell_gate_tanh.run();
@@ -948,7 +1007,7 @@
 
         if(_has_layer_norm)
         {
-            NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Input), Window::DimY);
+            NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY);
         }
 
         _input_gate_sigmoid.run();
@@ -979,7 +1038,7 @@
 
     if(_has_layer_norm)
     {
-        NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Output), Window::DimY);
+        NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY);
     }
 
     _output_gate_sigmoid.run();
@@ -1021,7 +1080,7 @@
     }
 
     // Copy output_state_out to output
-    NEScheduler::get().schedule(&_copy_output, Window::DimY);
+    _copy_output.run();
 }
 
 void NEQLSTMLayer::prepare()
@@ -1051,8 +1110,8 @@
         {
             _input_to_input_eff_bias.allocator()->allocate();
             _recurrent_to_input_eff_bias.allocator()->allocate();
-            NEScheduler::get().schedule(&_input_to_input_reduction, Window::DimY);
-            NEScheduler::get().schedule(&_recurrent_to_input_reduction, Window::DimY);
+            NEScheduler::get().schedule(_input_to_input_reduction.get(), Window::DimY);
+            NEScheduler::get().schedule(_recurrent_to_input_reduction.get(), Window::DimY);
 
             _input_to_input_weights_transposed.allocator()->allocate();
             _recurrent_to_input_weights_transposed.allocator()->allocate();
@@ -1067,17 +1126,17 @@
         _recurrent_to_cell_eff_bias.allocator()->allocate();
         _input_to_output_eff_bias.allocator()->allocate();
         _recurrent_to_output_eff_bias.allocator()->allocate();
-        NEScheduler::get().schedule(&_input_to_forget_reduction, Window::DimY);
-        NEScheduler::get().schedule(&_recurrent_to_forget_reduction, Window::DimY);
-        NEScheduler::get().schedule(&_input_to_cell_reduction, Window::DimY);
-        NEScheduler::get().schedule(&_recurrent_to_cell_reduction, Window::DimY);
-        NEScheduler::get().schedule(&_input_to_output_reduction, Window::DimY);
-        NEScheduler::get().schedule(&_recurrent_to_output_reduction, Window::DimY);
+        NEScheduler::get().schedule(_input_to_forget_reduction.get(), Window::DimY);
+        NEScheduler::get().schedule(_recurrent_to_forget_reduction.get(), Window::DimY);
+        NEScheduler::get().schedule(_input_to_cell_reduction.get(), Window::DimY);
+        NEScheduler::get().schedule(_recurrent_to_cell_reduction.get(), Window::DimY);
+        NEScheduler::get().schedule(_input_to_output_reduction.get(), Window::DimY);
+        NEScheduler::get().schedule(_recurrent_to_output_reduction.get(), Window::DimY);
 
         if(_has_projection)
         {
             _projection_eff_bias.allocator()->allocate();
-            NEScheduler::get().schedule(&_projection_reduction, Window::DimY);
+            NEScheduler::get().schedule(_projection_reduction.get(), Window::DimY);
             if(_projection_bias != nullptr)
             {
                 _projection_bias_add.run();
@@ -1106,5 +1165,4 @@
         _is_prepared = true;
     }
 }
-
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index c042705..a20ffb8 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/kernels/NEQuantizationLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index b7415bd..a8e1048 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -30,9 +30,24 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NEFlattenLayerKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NERNNLayer::~NERNNLayer() = default;
+
 NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
       _is_prepared(false)
@@ -99,7 +114,8 @@
     _activation.configure(&_add_output, hidden_state, info);
     _add_output.allocator()->allocate();
 
-    _copy_kernel.configure(hidden_state, output);
+    _copy_kernel = arm_compute::support::cpp14::make_unique<NECopyKernel>();
+    _copy_kernel->configure(hidden_state, output);
 }
 
 void NERNNLayer::run()
@@ -116,7 +132,7 @@
     _activation.run();
 
     // copy hidden out to output
-    NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+    NEScheduler::get().schedule(_copy_kernel.get(), Window::DimY);
 }
 
 void NERNNLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
index a3b116a..a046140 100644
--- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
@@ -23,7 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index 4aecadb..8bcf152 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,11 +24,14 @@
 #include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
 
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEROIPoolingLayer::~NEROIPoolingLayer() = default;
+
 NEROIPoolingLayer::NEROIPoolingLayer()
     : _roi_kernel()
 {
@@ -36,11 +39,12 @@
 
 void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
 {
-    _roi_kernel.configure(input, rois, output, pool_info);
+    _roi_kernel = arm_compute::support::cpp14::make_unique<NEROIPoolingLayerKernel>();
+    _roi_kernel->configure(input, rois, output, pool_info);
 }
 
 void NEROIPoolingLayer::run()
 {
-    NEScheduler::get().schedule(&_roi_kernel, Window::DimX);
+    NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX);
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp
index 138b458..ba166b2 100644
--- a/src/runtime/NEON/functions/NERange.cpp
+++ b/src/runtime/NEON/functions/NERange.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,13 @@
 #include "arm_compute/runtime/NEON/functions/NERange.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NERangeKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NERange::~NERange() = default;
+
 NERange::NERange()
     : _kernel()
 {
@@ -34,7 +38,8 @@
 
 void NERange::configure(ITensor *output, const float start, const float end, const float step)
 {
-    _kernel.configure(output, start, end, step);
+    _kernel = arm_compute::support::cpp14::make_unique<NERangeKernel>();
+    _kernel->configure(output, start, end, step);
 }
 
 Status NERange::validate(const ITensorInfo *output, const float start, const float end, const float step)
@@ -44,6 +49,6 @@
 
 void NERange::run()
 {
-    NEScheduler::get().schedule(&_kernel, Window::DimX);
+    NEScheduler::get().schedule(_kernel.get(), Window::DimX);
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index c3c5529..b50a925 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
@@ -96,6 +97,8 @@
 }
 } // namespace
 
+NEReduceMean::~NEReduceMean() = default;
+
 NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
       _output_no_quant()
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 4938a56..463b65e 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -26,7 +26,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
@@ -54,6 +56,8 @@
 }
 } // namespace
 
+NEReductionOperation::~NEReductionOperation() = default;
+
 NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _reduction_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
 {
@@ -125,7 +129,8 @@
     ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims));
 
     // Configure reduction kernel
-    _reduction_kernel.configure(input, output_internal, axis, op);
+    _reduction_kernel = arm_compute::support::cpp14::make_unique<NEReductionOperationKernel>();
+    _reduction_kernel->configure(input, output_internal, axis, op);
     _window_split   = reduction_window_split_dimension(axis);
     _reduction_axis = axis;
 
@@ -139,7 +144,7 @@
 void NEReductionOperation::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
-    NEScheduler::get().schedule(&_reduction_kernel, _window_split);
+    NEScheduler::get().schedule(_reduction_kernel.get(), _window_split);
     if(_is_reshape_required)
     {
         _reshape.run();
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
index d4e7f83..9276d49 100644
--- a/src/runtime/NEON/functions/NERemap.cpp
+++ b/src/runtime/NEON/functions/NERemap.cpp
@@ -25,17 +25,18 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NERemapKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -45,9 +46,11 @@
     ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
 
     auto k = arm_compute::support::cpp14::make_unique<NERemapKernel>();
-
     k->configure(input, map_x, map_y, output, policy);
-
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp
index dfe002a..77ec7fb 100644
--- a/src/runtime/NEON/functions/NEReorgLayer.cpp
+++ b/src/runtime/NEON/functions/NEReorgLayer.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h"
+#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index c1c88c1..915d5d4 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -23,10 +23,10 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/Types.h"
+#include "src/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
@@ -35,6 +35,8 @@
 {
 namespace experimental
 {
+NEReshape::~NEReshape() = default;
+
 void NEReshape::configure(const ITensorInfo *input, ITensorInfo *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>();
diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp
index c60c84e..3ed0688 100644
--- a/src/runtime/NEON/functions/NEReverse.cpp
+++ b/src/runtime/NEON/functions/NEReverse.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReverse.h"
 
-#include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
+#include "src/core/NEON/kernels/NEReverseKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index bbf8343..0290fe5 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEScaleKernel.h"
 
 #include "src/core/utils/ScaleUtils.h"
 
diff --git a/src/runtime/NEON/functions/NEScharr3x3.cpp b/src/runtime/NEON/functions/NEScharr3x3.cpp
index bf787e1..cea0eef 100644
--- a/src/runtime/NEON/functions/NEScharr3x3.cpp
+++ b/src/runtime/NEON/functions/NEScharr3x3.cpp
@@ -23,8 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
 
-#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEScharr3x3Kernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
@@ -36,5 +37,8 @@
     auto k = arm_compute::support::cpp14::make_unique<NEScharr3x3Kernel>();
     k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp
index 8def123..0d1f490 100644
--- a/src/runtime/NEON/functions/NESelect.cpp
+++ b/src/runtime/NEON/functions/NESelect.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NESelect.h"
 
-#include "arm_compute/core/NEON/kernels/NESelectKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/NEON/kernels/NESelectKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp
index 2bacf2e..dd56eab 100644
--- a/src/runtime/NEON/functions/NESlice.cpp
+++ b/src/runtime/NEON/functions/NESlice.cpp
@@ -24,10 +24,10 @@
 #include "arm_compute/runtime/NEON/functions/NESlice.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
 
 #include "support/MemorySupport.h"
 
diff --git a/src/runtime/NEON/functions/NESobel3x3.cpp b/src/runtime/NEON/functions/NESobel3x3.cpp
index cfd68d7..38d2dc2 100644
--- a/src/runtime/NEON/functions/NESobel3x3.cpp
+++ b/src/runtime/NEON/functions/NESobel3x3.cpp
@@ -23,18 +23,23 @@
  */
 #include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
 
-#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NESobel3x3Kernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NESobel3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<NESobel3x3Kernel>();
     k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index 092c510..e631fb3 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,8 +29,13 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NESobel5x5Kernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NESobel5x5::~NESobel5x5() = default;
 
 NESobel5x5::NESobel5x5(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
@@ -46,14 +51,18 @@
 
     TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16);
 
+    _sobel_hor      = arm_compute::support::cpp14::make_unique<NESobel5x5HorKernel>();
+    _sobel_vert     = arm_compute::support::cpp14::make_unique<NESobel5x5VertKernel>();
+    _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+
     if(run_sobel_x && run_sobel_y)
     {
         _tmp_x.allocator()->init(tensor_info);
         _tmp_y.allocator()->init(tensor_info);
         _memory_group.manage(&_tmp_x);
         _memory_group.manage(&_tmp_y);
-        _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+        _sobel_hor->configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+        _sobel_vert->configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
         _tmp_y.allocator()->allocate();
     }
@@ -61,28 +70,29 @@
     {
         _tmp_x.allocator()->init(tensor_info);
         _memory_group.manage(&_tmp_x);
-        _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+        _sobel_hor->configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+        _sobel_vert->configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
     }
     else if(run_sobel_y)
     {
         _tmp_y.allocator()->init(tensor_info);
         _memory_group.manage(&_tmp_y);
-        _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+        _sobel_hor->configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+        _sobel_vert->configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_y.allocator()->allocate();
     }
 
-    _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler->configure(input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value));
 }
 
 void NESobel5x5::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
-    NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+    NEScheduler::get().schedule(_sobel_hor.get(), Window::DimY);
+    NEScheduler::get().schedule(_sobel_vert.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index 87ec81f..bc5f87c 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,8 +29,13 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NESobel7x7Kernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NESobel7x7::~NESobel7x7() = default;
 
 NESobel7x7::NESobel7x7(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
@@ -45,6 +50,9 @@
     const bool run_sobel_y = output_y != nullptr;
 
     TensorInfo tensor_info(input->info()->tensor_shape(), Format::S32);
+    _sobel_hor      = arm_compute::support::cpp14::make_unique<NESobel7x7HorKernel>();
+    _sobel_vert     = arm_compute::support::cpp14::make_unique<NESobel7x7VertKernel>();
+    _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
 
     if(run_sobel_x && run_sobel_y)
     {
@@ -52,8 +60,8 @@
         _tmp_y.allocator()->init(tensor_info);
         _memory_group.manage(&_tmp_x);
         _memory_group.manage(&_tmp_y);
-        _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+        _sobel_hor->configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+        _sobel_vert->configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
         _tmp_y.allocator()->allocate();
     }
@@ -61,28 +69,29 @@
     {
         _tmp_x.allocator()->init(tensor_info);
         _memory_group.manage(&_tmp_x);
-        _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+        _sobel_hor->configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+        _sobel_vert->configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
     }
     else if(run_sobel_y)
     {
         _tmp_y.allocator()->init(tensor_info);
         _memory_group.manage(&_tmp_y);
-        _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+        _sobel_hor->configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+        _sobel_vert->configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_y.allocator()->allocate();
     }
 
-    _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler->configure(input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value));
 }
 
 void NESobel7x7::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
-    NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+    NEScheduler::get().schedule(_sobel_hor.get(), Window::DimY);
+    NEScheduler::get().schedule(_sobel_vert.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 4f77386..e79ab0e 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -24,14 +24,19 @@
 #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
 
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NESoftmaxLayerKernel.h"
 #include "src/core/helpers/SoftmaxHelpers.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
 template <bool IS_LOG>
+NESoftmaxLayerGeneric<IS_LOG>::~NESoftmaxLayerGeneric() = default;
+
+template <bool IS_LOG>
 NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _permute_input(), _permute_output(), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp(), _input_permuted(), _output_permuted(),
       _needs_permute(false)
@@ -76,15 +81,17 @@
     _memory_group.manage(&_max);
     _memory_group.manage(&_tmp);
 
-    // Configure Kernels
-    _max_kernel.configure(tmp_input, &_max);
+    // Configure kernels
+    _max_kernel     = arm_compute::support::cpp14::make_unique<NELogits1DMaxKernel>();
+    _softmax_kernel = arm_compute::support::cpp14::make_unique<NELogits1DSoftmaxKernel<IS_LOG>>();
+    _max_kernel->configure(tmp_input, &_max);
     if(_needs_permute)
     {
         // Add to the memory manager _output_permuted
         _memory_group.manage(&_output_permuted);
 
         // The normalization kernel stores the result in a permuted output tensor
-        _softmax_kernel.configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
+        _softmax_kernel->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
         _input_permuted.allocator()->allocate();
 
         // Re-permute the permuted output into the requested (4D) output
@@ -96,8 +103,9 @@
     else
     {
         // Softmax 2D case
-        _fill_border_kernel.configure(tmp_input, _max_kernel.border_size(), BorderMode::REPLICATE);
-        _softmax_kernel.configure(tmp_input, &_max, output, beta, &_tmp);
+        _fill_border_kernel = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+        _fill_border_kernel->configure(tmp_input, _max_kernel->border_size(), BorderMode::REPLICATE);
+        _softmax_kernel->configure(tmp_input, &_max, output, beta, &_tmp);
     }
 
     // Allocate intermediate buffers
@@ -152,10 +160,13 @@
     {
         _permute_input.run();
     }
+    else
+    {
+        NEScheduler::get().schedule(_fill_border_kernel.get(), Window::DimY);
+    }
 
-    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_max_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_softmax_kernel, Window::DimY);
+    NEScheduler::get().schedule(_max_kernel.get(), Window::DimY);
+    NEScheduler::get().schedule(_softmax_kernel.get(), Window::DimY);
 
     if(_needs_permute)
     {
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
index 97e793f..516e8d6 100644
--- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,9 +29,14 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEMemsetKernel.h"
+#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NESpaceToBatchLayer::~NESpaceToBatchLayer() = default;
+
 NESpaceToBatchLayer::NESpaceToBatchLayer()
     : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
 {
@@ -43,10 +48,12 @@
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
-        _has_padding = true;
-        _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _has_padding   = true;
+        _memset_kernel = arm_compute::support::cpp14::make_unique<NEMemsetKernel>();
+        _memset_kernel->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
-    _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+    _space_to_batch_kernel = arm_compute::support::cpp14::make_unique<NESpaceToBatchLayerKernel>();
+    _space_to_batch_kernel->configure(input, block_shape, paddings, output);
 }
 
 void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output)
@@ -55,10 +62,12 @@
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
-        _has_padding = true;
-        _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _has_padding   = true;
+        _memset_kernel = arm_compute::support::cpp14::make_unique<NEMemsetKernel>();
+        _memset_kernel->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
-    _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    _space_to_batch_kernel = arm_compute::support::cpp14::make_unique<NESpaceToBatchLayerKernel>();
+    _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
 }
 
 Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
@@ -81,8 +90,8 @@
     // Zero out output only if we have paddings
     if(_has_padding)
     {
-        NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+        NEScheduler::get().schedule(_memset_kernel.get(), Window::DimY);
     }
-    NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
+    NEScheduler::get().schedule(_space_to_batch_kernel.get(), Window::DimY);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
index 3e1ec80..a834600 100644
--- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,9 +29,13 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NESpaceToDepthLayer::~NESpaceToDepthLayer() = default;
+
 NESpaceToDepthLayer::NESpaceToDepthLayer()
     : _space_to_depth_kernel()
 {
@@ -40,7 +44,8 @@
 void NESpaceToDepthLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    _space_to_depth_kernel.configure(input, output, block_shape);
+    _space_to_depth_kernel = arm_compute::support::cpp14::make_unique<NESpaceToDepthLayerKernel>();
+    _space_to_depth_kernel->configure(input, output, block_shape);
 }
 
 Status NESpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
@@ -51,6 +56,6 @@
 
 void NESpaceToDepthLayer::run()
 {
-    NEScheduler::get().schedule(&_space_to_depth_kernel, Window::DimY);
+    NEScheduler::get().schedule(_space_to_depth_kernel.get(), Window::DimY);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index a99a95a..e38ff6b 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -30,9 +30,13 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEStackLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEStackLayer::~NEStackLayer() = default;
+
 NEStackLayer::NEStackLayer() // NOLINT
     : _input(),
       _stack_kernels(),
@@ -50,7 +54,8 @@
 
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
-        _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+        _stack_kernels[i] = arm_compute::support::cpp14::make_unique<NEStackLayerKernel>();
+        _stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output);
     }
 }
 
@@ -80,7 +85,7 @@
 {
     for(unsigned i = 0; i < _num_inputs; i++)
     {
-        NEScheduler::get().schedule(&_stack_kernels[i], Window::DimY);
+        NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY);
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp
index 8bf81e8..308b856 100644
--- a/src/runtime/NEON/functions/NEStridedSlice.cpp
+++ b/src/runtime/NEON/functions/NEStridedSlice.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NETableLookup.cpp b/src/runtime/NEON/functions/NETableLookup.cpp
index b8d765f..9295bf0 100644
--- a/src/runtime/NEON/functions/NETableLookup.cpp
+++ b/src/runtime/NEON/functions/NETableLookup.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NETableLookup.h"
 
-#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+#include "src/core/NEON/kernels/NETableLookupKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEThreshold.cpp b/src/runtime/NEON/functions/NEThreshold.cpp
index e21511e..2f1e304 100644
--- a/src/runtime/NEON/functions/NEThreshold.cpp
+++ b/src/runtime/NEON/functions/NEThreshold.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEThreshold.h"
 
-#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+#include "src/core/NEON/kernels/NEThresholdKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp
index 6fda3a5..6a1e20d 100644
--- a/src/runtime/NEON/functions/NETile.cpp
+++ b/src/runtime/NEON/functions/NETile.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NETile.h"
 
-#include "arm_compute/core/NEON/kernels/NETileKernel.h"
+#include "src/core/NEON/kernels/NETileKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index 88d1672..5af417f 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
 
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEUpsampleLayer.cpp b/src/runtime/NEON/functions/NEUpsampleLayer.cpp
index 58c050f..aae5838 100644
--- a/src/runtime/NEON/functions/NEUpsampleLayer.cpp
+++ b/src/runtime/NEON/functions/NEUpsampleLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,10 +23,13 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
+#include "src/core/NEON/kernels/NEUpsampleLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEUpsampleLayer::~NEUpsampleLayer() = default;
+
 NEUpsampleLayer::NEUpsampleLayer()
     : _kernel(), _data_layout()
 {
@@ -41,12 +44,13 @@
 void NEUpsampleLayer::configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy &policy)
 {
     _data_layout = input->info()->data_layout();
-    _kernel.configure(input, output, info, policy);
+    _kernel      = arm_compute::support::cpp14::make_unique<NEUpsampleLayerKernel>();
+    _kernel->configure(input, output, info, policy);
 }
 
 void NEUpsampleLayer::run()
 {
     const auto win = (_data_layout == DataLayout::NCHW) ? Window::DimZ : Window::DimX;
-    NEScheduler::get().schedule(&_kernel, win);
+    NEScheduler::get().schedule(_kernel.get(), win);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp
index ec2c688..b5dbfe0 100644
--- a/src/runtime/NEON/functions/NEWarpAffine.cpp
+++ b/src/runtime/NEON/functions/NEWarpAffine.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEWarpKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
@@ -58,5 +59,7 @@
             ARM_COMPUTE_ERROR("Interpolation type not supported");
     }
 
-    _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, constant_border_value);
+    _border_handler = std::move(b);
 }
diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp
index bf361b8..8d42121 100644
--- a/src/runtime/NEON/functions/NEWarpPerspective.cpp
+++ b/src/runtime/NEON/functions/NEWarpPerspective.cpp
@@ -24,14 +24,15 @@
 #include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEWarpKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEWarpPerspective::configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -58,5 +59,8 @@
             ARM_COMPUTE_ERROR("Interpolation type not supported");
     }
 
-    _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, constant_border_value);
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 23b9f60..1cb2458 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -30,6 +30,10 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
 #include "support/MemorySupport.h"
 
diff --git a/src/runtime/NEON/functions/NEYOLOLayer.cpp b/src/runtime/NEON/functions/NEYOLOLayer.cpp
index 233afb7..5cad53b 100644
--- a/src/runtime/NEON/functions/NEYOLOLayer.cpp
+++ b/src/runtime/NEON/functions/NEYOLOLayer.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEYOLOLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
+#include "src/core/NEON/kernels/NEYOLOLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/TracePoint.cpp b/src/runtime/TracePoint.cpp
index a4228b2..6cb672c 100644
--- a/src/runtime/TracePoint.cpp
+++ b/src/runtime/TracePoint.cpp
@@ -25,10 +25,10 @@
 #include <stdio.h>
 #include <vector>
 
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
 #include "arm_compute/runtime/Array.h"
 #include "arm_compute/runtime/Pyramid.h"
 #include "arm_compute/runtime/common/LSTMParams.h"
+#include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
 #include "utils/TypePrinter.h"
 
 namespace arm_compute