COMPMID-3639: (3RDPARTY_UPDATE) Move CL kernels to src

Change-Id: I10d27db788e5086adae1841e3e2441cd9b76ef84
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4310
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h
new file mode 100644
index 0000000..282cc96
--- /dev/null
+++ b/src/core/CL/CLKernels.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLKERNELS_H
+#define ARM_COMPUTE_CLKERNELS_H
+
+/* Header grouping all the CL kernels */
+#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
+#include "src/core/CL/kernels/CLAccumulateKernel.h"
+#include "src/core/CL/kernels/CLActivationLayerKernel.h"
+#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
+#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
+#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
+#include "src/core/CL/kernels/CLBitwiseAndKernel.h"
+#include "src/core/CL/kernels/CLBitwiseNotKernel.h"
+#include "src/core/CL/kernels/CLBitwiseOrKernel.h"
+#include "src/core/CL/kernels/CLBitwiseXorKernel.h"
+#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
+#include "src/core/CL/kernels/CLBox3x3Kernel.h"
+#include "src/core/CL/kernels/CLCannyEdgeKernel.h"
+#include "src/core/CL/kernels/CLChannelCombineKernel.h"
+#include "src/core/CL/kernels/CLChannelExtractKernel.h"
+#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
+#include "src/core/CL/kernels/CLCol2ImKernel.h"
+#include "src/core/CL/kernels/CLColorConvertKernel.h"
+#include "src/core/CL/kernels/CLComparisonKernel.h"
+#include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
+#include "src/core/CL/kernels/CLConvolutionKernel.h"
+#include "src/core/CL/kernels/CLCopyKernel.h"
+#include "src/core/CL/kernels/CLCropKernel.h"
+#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
+#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
+#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
+#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
+#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
+#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
+#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
+#include "src/core/CL/kernels/CLDequantizationLayerKernel.h"
+#include "src/core/CL/kernels/CLDerivativeKernel.h"
+#include "src/core/CL/kernels/CLDilateKernel.h"
+#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
+#include "src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+#include "src/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "src/core/CL/kernels/CLErodeKernel.h"
+#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
+#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
+#include "src/core/CL/kernels/CLFFTScaleKernel.h"
+#include "src/core/CL/kernels/CLFastCornersKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLFlattenLayerKernel.h"
+#include "src/core/CL/kernels/CLFloorKernel.h"
+#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLGatherKernel.h"
+#include "src/core/CL/kernels/CLGaussian3x3Kernel.h"
+#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
+#include "src/core/CL/kernels/CLGaussianPyramidKernel.h"
+#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
+#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
+#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "src/core/CL/kernels/CLHarrisCornersKernel.h"
+#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+#include "src/core/CL/kernels/CLHistogramKernel.h"
+#include "src/core/CL/kernels/CLIm2ColKernel.h"
+#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
+#include "src/core/CL/kernels/CLIntegralImageKernel.h"
+#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
+#include "src/core/CL/kernels/CLLKTrackerKernel.h"
+#include "src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
+#include "src/core/CL/kernels/CLMeanStdDevKernel.h"
+#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
+#include "src/core/CL/kernels/CLMedian3x3Kernel.h"
+#include "src/core/CL/kernels/CLMemsetKernel.h"
+#include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
+#include "src/core/CL/kernels/CLMinMaxLocationKernel.h"
+#include "src/core/CL/kernels/CLNonLinearFilterKernel.h"
+#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
+#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
+#include "src/core/CL/kernels/CLPadLayerKernel.h"
+#include "src/core/CL/kernels/CLPermuteKernel.h"
+#include "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+#include "src/core/CL/kernels/CLPoolingLayerKernel.h"
+#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
+#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+#include "src/core/CL/kernels/CLQuantizationLayerKernel.h"
+#include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
+#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
+#include "src/core/CL/kernels/CLRangeKernel.h"
+#include "src/core/CL/kernels/CLReductionOperationKernel.h"
+#include "src/core/CL/kernels/CLRemapKernel.h"
+#include "src/core/CL/kernels/CLReorgLayerKernel.h"
+#include "src/core/CL/kernels/CLReshapeLayerKernel.h"
+#include "src/core/CL/kernels/CLReverseKernel.h"
+#include "src/core/CL/kernels/CLScaleKernel.h"
+#include "src/core/CL/kernels/CLScharr3x3Kernel.h"
+#include "src/core/CL/kernels/CLSelectKernel.h"
+#include "src/core/CL/kernels/CLSobel3x3Kernel.h"
+#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
+#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
+#include "src/core/CL/kernels/CLSoftmaxLayerKernel.h"
+#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
+#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
+#include "src/core/CL/kernels/CLStackLayerKernel.h"
+#include "src/core/CL/kernels/CLStridedSliceKernel.h"
+#include "src/core/CL/kernels/CLTableLookupKernel.h"
+#include "src/core/CL/kernels/CLThresholdKernel.h"
+#include "src/core/CL/kernels/CLTileKernel.h"
+#include "src/core/CL/kernels/CLTransposeKernel.h"
+#include "src/core/CL/kernels/CLUpsampleLayerKernel.h"
+#include "src/core/CL/kernels/CLWarpAffineKernel.h"
+#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h"
+#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
+#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
+#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h"
+#include "src/core/CL/kernels/CLWinogradInputTransformKernel.h"
+#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h"
+#include "src/core/CL/kernels/CLYOLOLayerKernel.h"
+#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
+
+#endif /* ARM_COMPUTE_CLKERNELS_H */
diff --git a/src/core/CL/CLTracePoint.cpp b/src/core/CL/CLTracePoint.cpp
index 631cb84..d603f40 100644
--- a/src/core/CL/CLTracePoint.cpp
+++ b/src/core/CL/CLTracePoint.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/TracePoint.h"
 
+#include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/ICLDistribution1D.h"
 #include "arm_compute/core/CL/ICLHOG.h"
@@ -30,7 +31,6 @@
 #include "arm_compute/core/CL/ICLMultiHOG.h"
 #include "arm_compute/core/CL/ICLMultiImage.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
 #include "utils/TypePrinter.h"
 
 #include <vector>
diff --git a/src/core/CL/CLValidate.h b/src/core/CL/CLValidate.h
index cbbdf2d..7b5294e 100644
--- a/src/core/CL/CLValidate.h
+++ b/src/core/CL/CLValidate.h
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CL_VALIDATE_H
 #define ARM_COMPUTE_CL_VALIDATE_H
 
+#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/Validate.h"
 
 namespace arm_compute
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index f91510b..2b259bf 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
new file mode 100644
index 0000000..a24cd8c
--- /dev/null
+++ b/src/core/CL/ICLKernel.h
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICLKERNEL_H
+#define ARM_COMPUTE_ICLKERNEL_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/IKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/Types.h"
+
+#include <string>
+
+namespace arm_compute
+{
+template <typename T>
+class ICLArray;
+class ICLTensor;
+class Window;
+
+/** Common interface for all the OpenCL kernels */
+class ICLKernel : public IKernel
+{
+private:
+    /** Returns the number of arguments enqueued per array object.
+     *
+     * @return The number of arguments enqueued per array object.
+     */
+    template <unsigned int        dimension_size>
+    constexpr static unsigned int num_arguments_per_array()
+    {
+        return num_arguments_per_tensor<dimension_size>();
+    }
+    /** Returns the number of arguments enqueued per tensor object.
+     *
+     * @return The number of arguments enqueued per tensor object.
+     */
+    template <unsigned int        dimension_size>
+    constexpr static unsigned int num_arguments_per_tensor()
+    {
+        return 2 + 2 * dimension_size;
+    }
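+    // For example, a 3D tensor is enqueued as 2 + 2*3 = 8 kernel arguments:
+    // the cl_mem buffer, a (stride_in_bytes, stride_in_bytes * window step)
+    // pair for each of the 3 dimensions, and the byte offset of the first
+    // element addressed by the window (see add_array_argument() below).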
+    using IKernel::configure; //Prevent children from calling IKernel::configure() directly
+protected:
+    /** Configure the kernel's window and local workgroup size hint.
+     *
+     * @param[in] window   The maximum window which will be returned by window()
+     * @param[in] lws_hint (Optional) Local-Workgroup-Size to use.
+     */
+    void configure_internal(const Window &window, cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange())
+    {
+        _lws_hint = lws_hint;
+        IKernel::configure(window);
+    }
+
+public:
+    /** Constructor */
+    ICLKernel()
+        : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _lws_hint()
+    {
+    }
+    /** Returns a reference to the OpenCL kernel of this object.
+     *
+     * @return A reference to the OpenCL kernel of this object.
+     */
+    cl::Kernel &kernel()
+    {
+        return _kernel;
+    }
+    /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     array          Array to set as an argument of the object's kernel.
+     * @param[in]     strides        @ref Strides object containing stride of each dimension in bytes.
+     * @param[in]     num_dimensions Number of dimensions of the @p array.
+     * @param[in]     window         Window the kernel will be executed on.
+     */
+    template <typename T>
+    void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+    {
+        add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
+    }
+    /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     * @param[in]     window Window the kernel will be executed on.
+     */
+    void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        add_tensor_argument<1>(idx, tensor, window);
+    }
+    /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
+     *
+     * @param[in]     cond   Condition to check
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     * @param[in]     window Window the kernel will be executed on.
+     */
+    void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        if(cond)
+        {
+            add_1D_tensor_argument(idx, tensor, window);
+        }
+    }
+    /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     * @param[in]     window Window the kernel will be executed on.
+     */
+    void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        add_tensor_argument<2>(idx, tensor, window);
+    }
+    /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
+     *
+     * @param[in]     cond   Condition to check
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     * @param[in]     window Window the kernel will be executed on.
+     */
+    void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        if(cond)
+        {
+            add_2D_tensor_argument(idx, tensor, window);
+        }
+    }
+    /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     * @param[in]     window Window the kernel will be executed on.
+     */
+    void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        add_tensor_argument<3>(idx, tensor, window);
+    }
+    /** Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     * @param[in]     window Window the kernel will be executed on.
+     */
+    void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        add_tensor_argument<4>(idx, tensor, window);
+    }
+    /** Returns the number of arguments enqueued per 1D array object.
+     *
+     * @return The number of arguments enqueued per 1D array object.
+     */
+    constexpr static unsigned int num_arguments_per_1D_array()
+    {
+        return num_arguments_per_array<1>();
+    }
+    /** Returns the number of arguments enqueued per 1D tensor object.
+     *
+     * @return The number of arguments enqueued per 1D tensor object.
+     */
+    constexpr static unsigned int num_arguments_per_1D_tensor()
+    {
+        return num_arguments_per_tensor<1>();
+    }
+    /** Returns the number of arguments enqueued per 2D tensor object.
+     *
+     * @return The number of arguments enqueued per 2D tensor object.
+     */
+    constexpr static unsigned int num_arguments_per_2D_tensor()
+    {
+        return num_arguments_per_tensor<2>();
+    }
+    /** Returns the number of arguments enqueued per 3D tensor object.
+     *
+     * @return The number of arguments enqueued per 3D tensor object.
+     */
+    constexpr static unsigned int num_arguments_per_3D_tensor()
+    {
+        return num_arguments_per_tensor<3>();
+    }
+    /** Returns the number of arguments enqueued per 4D tensor object.
+     *
+     * @return The number of arguments enqueued per 4D tensor object.
+     */
+    constexpr static unsigned int num_arguments_per_4D_tensor()
+    {
+        return num_arguments_per_tensor<4>();
+    }
+    /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
+     *
+     * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
+     *
+     * @param[in]     window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     * @param[in,out] queue  Command queue on which to enqueue the kernel.
+     */
+    virtual void run(const Window &window, cl::CommandQueue &queue)
+    {
+        ARM_COMPUTE_UNUSED(window, queue);
+    }
+    /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
+     *
+     * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
+     *
+     * @param[in]     tensors A vector containing the tensors to operate on.
+     * @param[in]     window  Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     * @param[in,out] queue   Command queue on which to enqueue the kernel.
+     */
+    virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+    {
+        ARM_COMPUTE_UNUSED(tensors, window, queue);
+    }
+    /** Add the passed parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx   Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     value Value to set as an argument of the object's kernel.
+     */
+    template <typename T>
+    void add_argument(unsigned int &idx, T value)
+    {
+        _kernel.setArg(idx++, value);
+    }
+
+    /** Set the Local-Workgroup-Size hint
+     *
+     * @note This method should be called after the configuration of the kernel
+     *
+     * @param[in] lws_hint Local-Workgroup-Size to use
+     */
+    void set_lws_hint(const cl::NDRange &lws_hint)
+    {
+        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // lws_hint will be overwritten by configure()
+        _lws_hint = lws_hint;
+    }
+
+    /** Return the Local-Workgroup-Size hint
+     *
+     * @return Current lws hint
+     */
+    cl::NDRange lws_hint() const
+    {
+        return _lws_hint;
+    }
+
+    /** Get the configuration ID
+     *
+     * @note The configuration ID can be used by the caller to distinguish different calls of the same OpenCL kernel
+     *       In particular, this method can be used by CLScheduler to keep track of the best LWS for each configuration of the same kernel.
+     *       The configuration ID should be provided only for the kernels potentially affected by the LWS geometry
+     *
+     * @note This method should be called after the configuration of the kernel
+     *
+     * @return configuration id string
+     */
+    const std::string &config_id() const
+    {
+        return _config_id;
+    }
+
+    /** Set the targeted GPU architecture
+     *
+     * @param[in] target The targeted GPU architecture
+     */
+    void set_target(GPUTarget target)
+    {
+        _target = target;
+    }
+
+    /** Set the targeted GPU architecture according to the CL device
+     *
+     * @param[in] device A CL device
+     */
+    void set_target(cl::Device &device);
+
+    /** Get the targeted GPU architecture
+     *
+     * @return The targeted GPU architecture.
+     */
+    GPUTarget get_target() const
+    {
+        return _target;
+    }
+
+    /** Get the maximum workgroup size for the device the CLKernelLibrary uses.
+     *
+     * @return The maximum workgroup size value.
+     */
+    size_t get_max_workgroup_size();
+    /** Get the global work size given an execution window
+     *
+     * @param[in] window Execution window
+     *
+     * @return Global work size of the given execution window
+     */
+    static cl::NDRange gws_from_window(const Window &window);
+
+private:
+    /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     array          Array to set as an argument of the object's kernel.
+     * @param[in]     strides        @ref Strides object containing stride of each dimension in bytes.
+     * @param[in]     num_dimensions Number of dimensions of the @p array.
+     * @param[in]     window         Window the kernel will be executed on.
+     */
+    template <typename T, unsigned int dimension_size>
+    void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
+    /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     * @param[in]     window Window the kernel will be executed on.
+     */
+    template <unsigned int dimension_size>
+    void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+
+protected:
+    cl::Kernel  _kernel;             /**< OpenCL kernel to run */
+    GPUTarget   _target;             /**< The targeted GPU */
+    std::string _config_id;          /**< Configuration ID */
+    size_t      _max_workgroup_size; /**< The maximum workgroup size for this kernel */
+private:
+    cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
+};
+
+/** Add the kernel to the command queue with the given window.
+ *
+ * @note Depending on the size of the window, this might translate into several jobs being enqueued.
+ *
+ * @note If kernel.kernel() is empty then the function will return without adding anything to the queue.
+ *
+ * @param[in,out] queue                OpenCL command queue.
+ * @param[in]     kernel               Kernel to enqueue
+ * @param[in]     window               Window the kernel has to process.
+ * @param[in]     lws_hint             (Optional) Local workgroup size requested. Default is based on the device target.
+ * @param[in]     use_dummy_work_items (Optional) Use dummy work items to obtain a two-dimensional, power-of-two NDRange. Default is false.
+ *                                     Note: it is the kernel's responsibility to check whether a work-item is out-of-range.
+ *
+ * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
+ */
+void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false);
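+
+// A typical concrete kernel drives enqueue() from its run() method by slicing
+// its execution window. Illustrative sketch only: it assumes the kernel holds
+// _input/_output ICLTensor pointers and uses the Window slicing helpers
+// first_slice_window_3D()/slide_window_slice_3D() from arm_compute/core/Window.h:
+//
+//     void run(const Window &window, cl::CommandQueue &queue) override
+//     {
+//         Window slice = window.first_slice_window_3D();
+//         do
+//         {
+//             unsigned int idx = 0;
+//             add_3D_tensor_argument(idx, _input, slice);
+//             add_3D_tensor_argument(idx, _output, slice);
+//             enqueue(queue, *this, slice, lws_hint());
+//         }
+//         while(window.slide_window_slice_3D(slice));
+//     }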
+
+/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in]     array          Array to set as an argument of the object's kernel.
+ * @param[in]     strides        @ref Strides object containing stride of each dimension in bytes.
+ * @param[in]     num_dimensions Number of dimensions of the @p array.
+ * @param[in]     window         Window the kernel will be executed on.
+ */
+template <typename T, unsigned int dimension_size>
+void ICLKernel::add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON(array == nullptr);
+
+    // Calculate offset to the start of the window
+    unsigned int offset_first_element = 0;
+
+    for(unsigned int n = 0; n < num_dimensions; ++n)
+    {
+        offset_first_element += window[n].start() * strides[n];
+    }
+
+    unsigned int idx_start = idx;
+    _kernel.setArg(idx++, array->cl_buffer());
+
+    for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+    {
+        _kernel.setArg<cl_uint>(idx++, strides[dimension]);
+        _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
+    }
+
+    _kernel.setArg<cl_uint>(idx++, offset_first_element);
+
+    ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx,
+                                 "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
+    ARM_COMPUTE_UNUSED(idx_start);
+}
+}
+#endif /*ARM_COMPUTE_ICLKERNEL_H */
diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp
index dfef582..5d8295b 100644
--- a/src/core/CL/ICLSimple2DKernel.cpp
+++ b/src/core/CL/ICLSimple2DKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "src/core/CL/ICLSimple2DKernel.h"
 
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/CL/ICLSimple2DKernel.h b/src/core/CL/ICLSimple2DKernel.h
new file mode 100644
index 0000000..5246492
--- /dev/null
+++ b/src/core/CL/ICLSimple2DKernel.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICLSIMPLE2DKERNEL_H
+#define ARM_COMPUTE_ICLSIMPLE2DKERNEL_H
+
+#include "src/core/CL/ICLSimpleKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output. This interface can be used when the work-item processes a 2D tile */
+class ICLSimple2DKernel : public ICLSimpleKernel
+{
+public:
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+};
+}
+#endif /*ARM_COMPUTE_ICLSIMPLE2DKERNEL_H */
diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp
index 3d08262..fef1a86 100644
--- a/src/core/CL/ICLSimple3DKernel.cpp
+++ b/src/core/CL/ICLSimple3DKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/ICLSimple3DKernel.h"
+#include "src/core/CL/ICLSimple3DKernel.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/CL/ICLSimple3DKernel.h b/src/core/CL/ICLSimple3DKernel.h
new file mode 100644
index 0000000..ff0b274
--- /dev/null
+++ b/src/core/CL/ICLSimple3DKernel.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICLSIMPLE3DKERNEL_H
+#define ARM_COMPUTE_ICLSIMPLE3DKERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output.
+ *  Both input tensor and output tensor must have at least 3 dimensions.
+ */
+class ICLSimple3DKernel : public ICLSimple2DKernel
+{
+public:
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+};
+}
+#endif /*ARM_COMPUTE_ICLSIMPLE3DKERNEL_H */
diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp
index 90b5be8..d67fefd 100644
--- a/src/core/CL/ICLSimpleKernel.cpp
+++ b/src/core/CL/ICLSimpleKernel.cpp
@@ -21,8 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/ICLSimpleKernel.h"
-
+#include "src/core/CL/ICLSimpleKernel.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Validate.h"
diff --git a/src/core/CL/ICLSimpleKernel.h b/src/core/CL/ICLSimpleKernel.h
new file mode 100644
index 0000000..b35547a
--- /dev/null
+++ b/src/core/CL/ICLSimpleKernel.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICLSIMPLEKERNEL_H
+#define ARM_COMPUTE_ICLSIMPLEKERNEL_H
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output */
+class ICLSimpleKernel : public ICLKernel
+{
+public:
+    /** Constructor. */
+    ICLSimpleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ICLSimpleKernel(const ICLSimpleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ICLSimpleKernel &operator=(const ICLSimpleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    ICLSimpleKernel(ICLSimpleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    ICLSimpleKernel &operator=(ICLSimpleKernel &&) = default;
+    /** Default destructor */
+    ~ICLSimpleKernel() = default;
+
+    /** Configure the kernel
+     *
+     * @param[in]  input                             Source tensor.
+     * @param[out] output                            Destination tensor.
+     * @param[in]  num_elems_processed_per_iteration Number of processed elements per iteration.
+     * @param[in]  border_undefined                  (Optional) True if the border mode is undefined. False if it's replicate or constant.
+     * @param[in]  border_size                       (Optional) Size of the border.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
+
+protected:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+}
+
+#endif /*ARM_COMPUTE_ICLSIMPLEKERNEL_H */
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
index 29745be..76b60cb 100644
--- a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
+++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
@@ -21,14 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
 #include "src/core/helpers/WindowHelpers.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h
new file mode 100644
index 0000000..28f28fe
--- /dev/null
+++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H
+#define ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the absolute difference kernel.
+ *
+ * Absolute difference is computed by:
+ * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
+ */
+class CLAbsoluteDifferenceKernel : public ICLKernel
+{
+public:
+    /** Default constructor. */
+    CLAbsoluteDifferenceKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLAbsoluteDifferenceKernel(const CLAbsoluteDifferenceKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLAbsoluteDifferenceKernel &operator=(const CLAbsoluteDifferenceKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLAbsoluteDifferenceKernel(CLAbsoluteDifferenceKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLAbsoluteDifferenceKernel &operator=(CLAbsoluteDifferenceKernel &&) = default;
+    /** Default destructor */
+    ~CLAbsoluteDifferenceKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input1 Source tensor. Data types supported: U8/S16.
+     * @param[in]  input2 Source tensor. Data types supported: U8/S16.
+     * @param[out] output Destination tensor. Data types supported: U8/S16.
+     */
+    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          Source tensor. Data types supported: U8/S16.
+     * @param[in]  input2          Source tensor. Data types supported: U8/S16.
+     * @param[out] output          Destination tensor. Data types supported: U8/S16.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
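+
+    // Minimal usage sketch (illustrative; assumes in1, in2 and out are CLTensor
+    // objects already allocated with matching U8/S16 shapes, and that the kernel
+    // is dispatched through the runtime's CLScheduler):
+    //
+    //     CLAbsoluteDifferenceKernel abs_diff;
+    //     abs_diff.configure(&in1, &in2, &out);
+    //     CLScheduler::get().enqueue(abs_diff);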
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input1; /**< Source tensor 1. */
+    const ICLTensor *_input2; /**< Source tensor 2. */
+    ICLTensor       *_output; /**< Destination tensor. */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H */
diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp
index f161906..b0a8eba 100644
--- a/src/core/CL/kernels/CLAccumulateKernel.cpp
+++ b/src/core/CL/kernels/CLAccumulateKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
+#include "src/core/CL/kernels/CLAccumulateKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLAccumulateKernel.h b/src/core/CL/kernels/CLAccumulateKernel.h
new file mode 100644
index 0000000..16a7153
--- /dev/null
+++ b/src/core/CL/kernels/CLAccumulateKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLACCUMULATEKERNEL_H
+#define ARM_COMPUTE_CLACCUMULATEKERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the accumulate kernel.
+ *
+ * Accumulation is computed by:
+ * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
+ */
+class CLAccumulateKernel : public ICLSimple2DKernel
+{
+public:
+    /** Set the input and accumulation tensors.
+     *
+     * @param[in]  input Source tensor. Data types supported: U8.
+     * @param[out] accum Destination tensor. Data types supported: S16.
+     */
+    void configure(const ICLTensor *input, ICLTensor *accum);
+    /** Set the input and accumulation tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: U8.
+     * @param[out] accum           Destination tensor. Data types supported: S16.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum);
+};
+
+/** Interface for the accumulate weighted kernel.
+ *
+ * Weighted accumulation is computed:
+ * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
+ *
+ * Where @f$ 0 \le \alpha \le 1 @f$
+ * Conceptually, the rounding for this is defined as:
+ * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
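+ *
+ * For instance, with @f$ \alpha = 0.25 @f$, @f$ accum(x,y) = 100 @f$ and @f$ input(x,y) = 180 @f$
+ * the result is @f$ 0.75 * 100 + 0.25 * 180 = 120 @f$.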
+*/
+class CLAccumulateWeightedKernel : public ICLSimple2DKernel
+{
+public:
+    /** Set the input and accumulation tensors, and the scale value.
+     *
+     * @param[in]     input Source tensor. Data types supported: U8.
+     * @param[in]     alpha Scalar value in the range [0, 1.0]. Data types supported: F32.
+     * @param[in,out] accum Accumulated tensor. Data types supported: U8.
+     */
+    void configure(const ICLTensor *input, float alpha, ICLTensor *accum);
+    /** Set the input and accumulation tensors, and the scale value.
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in]     input           Source tensor. Data types supported: U8.
+     * @param[in]     alpha           Scalar value in the range [0, 1.0]. Data types supported: F32.
+     * @param[in,out] accum           Accumulated tensor. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum);
+};
+
+/** Interface for the accumulate squared kernel.
+ *
+ * The accumulation of squares is computed:
+ * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
+ *
+ * Where @f$ 0 \le shift \le 15 @f$
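+ *
+ * For instance, with @f$ shift = 2 @f$, @f$ input(x,y) = 20 @f$ and @f$ accum(x,y) = 100 @f$
+ * the result is @f$ 100 + (20^2 >> 2) = 200 @f$.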
+*/
+class CLAccumulateSquaredKernel : public ICLSimple2DKernel
+{
+public:
+    /** Set the input and accumulation tensors and the shift value.
+     *
+     * @param[in]     input Source tensor. Data types supported: U8.
+     * @param[in]     shift Shift value in the range of [0, 15]. Data types supported: U32.
+     * @param[in,out] accum Accumulated tensor. Data types supported: S16.
+     */
+    void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum);
+    /** Set the input and accumulation tensors and the shift value.
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in]     input           Source tensor. Data types supported: U8.
+     * @param[in]     shift           Shift value in the range of [0, 15]. Data types supported: U32.
+     * @param[in,out] accum           Accumulated tensor. Data types supported: S16.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum);
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLACCUMULATEKERNEL_H */
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index f0e3047..8ddf8d8 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+#include "src/core/CL/kernels/CLActivationLayerKernel.h"
 
 #include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.h b/src/core/CL/kernels/CLActivationLayerKernel.h
new file mode 100644
index 0000000..821418f
--- /dev/null
+++ b/src/core/CL/kernels/CLActivationLayerKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H
+#define ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+/** Interface for the activation layer kernel. */
+class CLActivationLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLActivationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLActivationLayerKernel(const CLActivationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLActivationLayerKernel &operator=(const CLActivationLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLActivationLayerKernel(CLActivationLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLActivationLayerKernel &operator=(CLActivationLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLActivationLayerKernel() = default;
+    /** Set the input and output tensor.
+     *
+     * @note If the output tensor is nullptr, the activation function will be performed in-place
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input           Source tensor. If @p output is nullptr, this tensor will store the result
+     *                                 of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[out]     output          Destination tensor. Data type supported: same as @p input
+     * @param[in]      act_info        Activation layer information.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *output, ActivationLayerInfo act_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayerKernel
+     *
+     * @param[in] input    Source tensor info. If @p output is nullptr, this tensor info will store the result
+     *                     of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[in] output   Destination tensor info. Data type supported: same as @p input
+     * @param[in] act_info Activation layer information.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
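+
+    // Run-time usage sketch (illustrative; assumes src and dst are ICLTensor
+    // objects whose infos match those passed to configure()): the tensors are
+    // supplied through an ITensorPack at execution time rather than being
+    // stored by the kernel:
+    //
+    //     ITensorPack pack;
+    //     pack.add_tensor(TensorType::ACL_SRC, src);
+    //     pack.add_tensor(TensorType::ACL_DST, dst);
+    //     act_kernel.run_op(pack, act_kernel.window(), queue);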
+
+private:
+    bool _run_in_place;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index b5a801a..0e6fc65 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h"
+#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
new file mode 100644
index 0000000..929677f
--- /dev/null
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
+#define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reduction operation kernel
+ *
+ * @note The default data type for an uninitialized output tensor is
+ *       signed 32-bit integer (S32). It is the user's responsibility to check
+ *       that the results do not overflow because the indices are computed
+ *       in unsigned 32-bit (U32).
+ */
+class CLArgMinMaxLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLArgMinMaxLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLArgMinMaxLayerKernel(const CLArgMinMaxLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLArgMinMaxLayerKernel &operator=(const CLArgMinMaxLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLArgMinMaxLayerKernel(CLArgMinMaxLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLArgMinMaxLayerKernel &operator=(CLArgMinMaxLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLArgMinMaxLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input       Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+     * @param[in]  prev_output Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
+     *                         Has to be nullptr for the first iteration
+     * @param[out] output      Destination tensor. Data types supported: U32/S32
+     *                         Output will have the same number of dimensions as input.
+     * @param[in]  axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
+     * @param[in]  op          Reduction operation to perform. Only ArgMin and ArgMax are supported.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+     * @param[in]  prev_output     Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
+     *                             Has to be nullptr for the first iteration
+     * @param[out] output          Destination tensor. Data types supported: U32/S32
+     *                             Output will have the same number of dimensions as input.
+     * @param[in]  axis            Axis along which to reduce. Supported reduction axis : 0,1,2,3
+     * @param[in]  op              Reduction operation to perform. Only ArgMin and ArgMax are supported.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel.
+     *
+     * @param[in] input       Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+     * @param[in] prev_output Destination tensor info of the previous iterations. Data types supported: U32/S32
+     *                        Has to be nullptr for the first iteration
+     * @param[in] output      Destination tensor info. Data types supported: U32/S32
+     *                        Output will have the same number of dimensions as input.
+     * @param[in] axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
+     * @param[in] op          Reduction operation to perform. Only ArgMin and ArgMax are supported.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor   *_input;
+    const ICLTensor   *_prev_output;
+    ICLTensor         *_output;
+    unsigned int       _reduction_axis;
+    ReductionOperation _op;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H */
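The doc block above describes a multi-stage reduction interface; the sketch below shows what a first-iteration validate() call might look like for an ARG_IDX_MAX reduction along axis 0. The shapes and helper name are assumptions for illustration only.

    // Minimal sketch (assumed shapes): first iteration, so prev_output is nullptr.
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"

    using namespace arm_compute;

    void argminmax_validate_sketch()
    {
        const TensorInfo src(TensorShape(128U, 4U), 1, DataType::F32);
        TensorInfo       dst{}; // left uninitialised; per the note above it would default to S32

        const Status status = CLArgMinMaxLayerKernel::validate(&src, nullptr, &dst, 0 /* axis */,
                                                               ReductionOperation::ARG_IDX_MAX);
        ARM_COMPUTE_ERROR_THROW_ON(status);
    }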
diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
index 7a8c9ad..7e9424f 100644
--- a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
+#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.h b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.h
new file mode 100644
index 0000000..54a89eb
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H
+#define ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the batch concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class CLBatchConcatenateLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLBatchConcatenateLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBatchConcatenateLayerKernel(const CLBatchConcatenateLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBatchConcatenateLayerKernel &operator=(const CLBatchConcatenateLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLBatchConcatenateLayerKernel(CLBatchConcatenateLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLBatchConcatenateLayerKernel &operator=(CLBatchConcatenateLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLBatchConcatenateLayerKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in]     input           Input tensor. Data types supported: All.
+     * @param[in]     batch_offset    The offset on axis 3.
+     * @param[in,out] output          Output tensor. Data types supported: Same as @p input.
+     *
+     * @note The output tensor's two lowest dimensions can't be smaller than the input's.
+     * @note The differences between the two lowest dimensions of the input and output must each be divisible by 2.
+     *
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output);
+    /**  Static function to check if given info will lead to a valid configuration of @ref CLBatchConcatenateLayerKernel
+     *
+     * @param[in] input        Input tensor info. Data types supported: All.
+     * @param[in] batch_offset The offset on axis 3.
+     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    unsigned int _batch_offset;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H */
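The batch_offset parameter places a single input at the right batch slice of the shared output; the sketch below validates the second of two 2-batch inputs going into a 4-batch output. Shapes are illustrative assumptions.

    // Minimal sketch (assumed shapes): concatenate a [16,16,3,2] input into a [16,16,3,4] output at batch offset 2.
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h"

    using namespace arm_compute;

    void batch_concat_validate_sketch()
    {
        const TensorInfo src(TensorShape(16U, 16U, 3U, 2U), 1, DataType::F32); // [W, H, C, N]
        const TensorInfo dst(TensorShape(16U, 16U, 3U, 4U), 1, DataType::F32); // holds the batches of every input

        const Status status = CLBatchConcatenateLayerKernel::validate(&src, 2U /* batch_offset */, &dst);
        ARM_COMPUTE_ERROR_THROW_ON(status);
    }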
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 09b668d..9aeca3b 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
new file mode 100644
index 0000000..743f4a9
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the BatchNormalization layer kernel.
+ */
+class CLBatchNormalizationLayerKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLBatchNormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBatchNormalizationLayerKernel(const CLBatchNormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBatchNormalizationLayerKernel &operator=(const CLBatchNormalizationLayerKernel &) = delete;
+    /** Default move constructor */
+    CLBatchNormalizationLayerKernel(CLBatchNormalizationLayerKernel &&) = default;
+    /** Default move assignment operator */
+    CLBatchNormalizationLayerKernel &operator=(CLBatchNormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLBatchNormalizationLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
+     *
+     * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
+     *                          3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                          The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+     * @param[out]     output   Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in]      mean     Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]      var      Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]      beta     (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
+     * @param[in]      gamma    (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
+     * @param[in]      epsilon  (Optional) Small value to avoid division with zero. Default value is 0.001f.
+     * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
+                   ActivationLayerInfo act_info = ActivationLayerInfo());
+    /** Set the input and output tensors.
+     *
+     * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
+     *                                 3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                                 The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+     * @param[out]     output          Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in]      mean            Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]      var             Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]      beta            (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
+     * @param[in]      gamma           (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
+     * @param[in]      epsilon         (Optional) Small value to avoid division with zero. Default value is 0.001f.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
+                   const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel
+     *
+     * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
+     *                     3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                     The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+     * @param[in] output   Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in] mean     Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in] var      Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in] beta     (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
+     * @param[in] gamma    (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
+     * @param[in] epsilon  (Optional) Small value to avoid division with zero. Default value is 0.001f.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                           const ITensorInfo *mean, const ITensorInfo *var,
+                           const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
+                           float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor       *_input;
+    ICLTensor       *_output;
+    const ICLTensor *_mean;
+    const ICLTensor *_var;
+    const ICLTensor *_beta;
+    const ICLTensor *_gamma;
+    float            _epsilon;
+    bool             _run_in_place;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H */
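Because a nullptr output selects the in-place path, the sketch below validates an in-place batch normalization with a fused RELU. The shapes and the default NCHW layout are illustrative assumptions.

    // Minimal sketch (assumed shapes, default NCHW layout): in-place batch normalization with fused RELU.
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"

    using namespace arm_compute;

    void batchnorm_validate_sketch()
    {
        const TensorInfo src(TensorShape(16U, 16U, 8U), 1, DataType::F32); // [W, H, FM]
        const TensorInfo mean(TensorShape(8U), 1, DataType::F32);          // one value per feature map
        const TensorInfo var(TensorShape(8U), 1, DataType::F32);

        // beta/gamma omitted (defaults 0 and 1); the nullptr output selects in-place execution.
        const Status status = CLBatchNormalizationLayerKernel::validate(&src, nullptr, &mean, &var,
                                                                        nullptr, nullptr, 0.001f,
                                                                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
        ARM_COMPUTE_ERROR_THROW_ON(status);
    }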
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index e5997fb..da41feb 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
+#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
new file mode 100644
index 0000000..131a43e
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
+#define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the batch to space kernel */
+class CLBatchToSpaceLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLBatchToSpaceLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBatchToSpaceLayerKernel(const CLBatchToSpaceLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBatchToSpaceLayerKernel &operator=(const CLBatchToSpaceLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLBatchToSpaceLayerKernel(CLBatchToSpaceLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLBatchToSpaceLayerKernel &operator=(CLBatchToSpaceLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLBatchToSpaceLayerKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
+     * @param[out] output      Tensor output. Data types supported: same as @p input
+     */
+    void configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape     1-D tensor with shape [M]. Data types supported: S32
+     * @param[out] output          Tensor output. Data types supported: same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+    /** Initialise the kernel's inputs and output (Static block shape).
+     *
+     * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape_x Block shape x value.
+     * @param[in]  block_shape_y Block shape y value.
+     * @param[out] output        Tensor output. Data types supported: same as @p input
+     */
+    void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
+    /** Initialise the kernel's inputs and output (Static block shape).
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape_x   Block shape x value.
+     * @param[in]  block_shape_y   Block shape y value.
+     * @param[out] output          Tensor output. Data types supported: same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel
+     *
+     * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
+     * @param[in] output      Tensor output. Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel (Static block shape).
+     *
+     * @param[in] input         Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in] block_shape_x Block shape x value.
+     * @param[in] block_shape_y Block shape y value.
+     * @param[in] output        Tensor output. Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;       /**< Source tensor */
+    const ICLTensor *_block_shape; /**< Block shape tensor */
+    ICLTensor       *_output;      /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H */
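The static overloads take the block shape as scalars; the sketch below validates a 2x2 block that folds 4 batches back into the spatial dimensions. Shapes are illustrative assumptions, and the output is left uninitialised so its shape can be inferred.

    // Minimal sketch (assumed shapes): static 2x2 block shape, [2,2,1,4] input.
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"

    using namespace arm_compute;

    void batch_to_space_validate_sketch()
    {
        const TensorInfo src(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32); // [W, H, C, N]
        TensorInfo       dst{}; // uninitialised; the expected result shape is [W*2, H*2, C, N/(2*2)]

        const Status status = CLBatchToSpaceLayerKernel::validate(&src, 2 /* block_shape_x */, 2 /* block_shape_y */, &dst);
        ARM_COMPUTE_ERROR_THROW_ON(status);
    }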
diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.cpp b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
index 53a438d..91a6592 100644
--- a/src/core/CL/kernels/CLBitwiseAndKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
+#include "src/core/CL/kernels/CLBitwiseAndKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.h b/src/core/CL/kernels/CLBitwiseAndKernel.h
new file mode 100644
index 0000000..01018ee
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseAndKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBITWISEANDKERNEL_H
+#define ARM_COMPUTE_CLBITWISEANDKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the bitwise AND operation kernel.
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f]
+ */
+class CLBitwiseAndKernel : public ICLKernel
+{
+public:
+    /** Default constructor. */
+    CLBitwiseAndKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBitwiseAndKernel(const CLBitwiseAndKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBitwiseAndKernel &operator=(const CLBitwiseAndKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLBitwiseAndKernel(CLBitwiseAndKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLBitwiseAndKernel &operator=(CLBitwiseAndKernel &&) = default;
+    /** Set the inputs and output images
+     *
+     * @param[in]  input1 Source tensor. Data types supported: U8.
+     * @param[in]  input2 Source tensor. Data types supported: U8.
+     * @param[out] output Destination tensor. Data types supported: U8.
+     */
+    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+    /** Set the inputs and output images
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          Source tensor. Data types supported: U8.
+     * @param[in]  input2          Source tensor. Data types supported: U8.
+     * @param[out] output          Destination tensor. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input1; /**< Source tensor 1 */
+    const ICLTensor *_input2; /**< Source tensor 2 */
+    ICLTensor       *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLBITWISEANDKERNEL_H */
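CLBitwiseAndKernel (like the Not/Or/Xor kernels that follow) exposes no validate() helper, so the sketch below shows the usual configure/allocate/enqueue flow on U8 tensors. The shapes, the default_init() call and the helper name are illustrative assumptions, not part of this patch.

    // Minimal sketch (assumed shapes, default CL runtime): element-wise AND of two U8 tensors.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLBitwiseAndKernel.h"

    using namespace arm_compute;

    void bitwise_and_sketch()
    {
        CLScheduler::get().default_init(); // default context/queue for this sketch

        CLTensor in1, in2, out;
        const TensorInfo info(TensorShape(64U, 64U), 1, DataType::U8);
        in1.allocator()->init(info);
        in2.allocator()->init(info);
        out.allocator()->init(info);

        CLBitwiseAndKernel and_kernel;
        and_kernel.configure(&in1, &in2, &out); // configure before allocating so padding requirements are applied

        in1.allocator()->allocate();
        in2.allocator()->allocate();
        out.allocator()->allocate();
        // ... fill in1/in2 here, e.g. via map()/unmap() ...

        CLScheduler::get().enqueue(and_kernel);
        CLScheduler::get().sync();
    }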
diff --git a/src/core/CL/kernels/CLBitwiseNotKernel.cpp b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
index 08e4c54..118bfe8 100644
--- a/src/core/CL/kernels/CLBitwiseNotKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
+#include "src/core/CL/kernels/CLBitwiseNotKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLBitwiseNotKernel.h b/src/core/CL/kernels/CLBitwiseNotKernel.h
new file mode 100644
index 0000000..bf68bc7
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseNotKernel.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBITWISENOTKERNEL_H
+#define ARM_COMPUTE_CLBITWISENOTKERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the bitwise NOT operation kernel.
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = \lnot input(x,y) @f]
+ */
+class CLBitwiseNotKernel : public ICLSimple2DKernel
+{
+public:
+    /** Set the inputs and output images.
+     *
+     * @param[in]  input  Source tensor. Data types supported: U8.
+     * @param[out] output Destination tensor. Data types supported: U8.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+    /** Set the inputs and output images.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: U8.
+     * @param[out] output          Destination tensor. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLBITWISENOTKERNEL_H */
diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.cpp b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
index 0e2e5d4..8954d9a 100644
--- a/src/core/CL/kernels/CLBitwiseOrKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
+#include "src/core/CL/kernels/CLBitwiseOrKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.h b/src/core/CL/kernels/CLBitwiseOrKernel.h
new file mode 100644
index 0000000..c27d0c2
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseOrKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBITWISEORKERNEL_H
+#define ARM_COMPUTE_CLBITWISEORKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the bitwise OR operation kernel.
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f]
+ */
+class CLBitwiseOrKernel : public ICLKernel
+{
+public:
+    /** Default constructor. */
+    CLBitwiseOrKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBitwiseOrKernel(const CLBitwiseOrKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBitwiseOrKernel &operator=(const CLBitwiseOrKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLBitwiseOrKernel(CLBitwiseOrKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLBitwiseOrKernel &operator=(CLBitwiseOrKernel &&) = default;
+    /** Set the inputs and output images
+     *
+     * @param[in]  input1 Source tensor. Data types supported: U8.
+     * @param[in]  input2 Source tensor. Data types supported: U8.
+     * @param[out] output Destination tensor. Data types supported: U8.
+     */
+    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+    /** Set the inputs and output images
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          Source tensor. Data types supported: U8.
+     * @param[in]  input2          Source tensor. Data types supported: U8.
+     * @param[out] output          Destination tensor. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input1; /**< Source tensor 1 */
+    const ICLTensor *_input2; /**< Source tensor 2 */
+    ICLTensor       *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLBITWISEORKERNEL_H */
diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.cpp b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
index 65b17c0..69eb38e 100644
--- a/src/core/CL/kernels/CLBitwiseXorKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
+#include "src/core/CL/kernels/CLBitwiseXorKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.h b/src/core/CL/kernels/CLBitwiseXorKernel.h
new file mode 100644
index 0000000..b4861ea
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseXorKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBITWISEXORKERNEL_H
+#define ARM_COMPUTE_CLBITWISEXORKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the bitwise XOR operation kernel.
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f]
+ */
+class CLBitwiseXorKernel : public ICLKernel
+{
+public:
+    /** Default constructor. */
+    CLBitwiseXorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBitwiseXorKernel(const CLBitwiseXorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBitwiseXorKernel &operator=(const CLBitwiseXorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLBitwiseXorKernel(CLBitwiseXorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLBitwiseXorKernel &operator=(CLBitwiseXorKernel &&) = default;
+    /** Set the inputs and output images
+     *
+     * @param[in]  input1 Source tensor. Data types supported: U8.
+     * @param[in]  input2 Source tensor. Data types supported: U8.
+     * @param[out] output Destination tensor. Data types supported: U8.
+     */
+    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+    /** Set the inputs and output images
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          Source tensor. Data types supported: U8.
+     * @param[in]  input2          Source tensor. Data types supported: U8.
+     * @param[out] output          Destination tensor. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input1; /**< Source tensor 1 */
+    const ICLTensor *_input2; /**< Source tensor 2 */
+    ICLTensor       *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLBITWISEXORKERNEL_H */
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index b8c0d2f..bcfd9b8 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
+#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
new file mode 100644
index 0000000..08f350e
--- /dev/null
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H
+#define ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the bounding box kernel */
+class CLBoundingBoxTransformKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLBoundingBoxTransformKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBoundingBoxTransformKernel(const CLBoundingBoxTransformKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBoundingBoxTransformKernel &operator=(const CLBoundingBoxTransformKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLBoundingBoxTransformKernel(CLBoundingBoxTransformKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLBoundingBoxTransformKernel &operator=(CLBoundingBoxTransformKernel &&) = default;
+    /** Default destructor */
+    ~CLBoundingBoxTransformKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  boxes      Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
+     * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input
+     * @param[in]  deltas     Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K  is the number of classes.
+     *                        Data types supported: QASYMM8 if @p input is QASYMM16, otherwise same as @p input
+     * @param[in]  info       Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
+     *
+     * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
+     *
+     */
+    void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  boxes           Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
+     * @param[out] pred_boxes      Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input
+     * @param[in]  deltas          Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K  is the number of classes.
+     *                             Data types supported: QASYMM8 if @p input is QASYMM16, otherwise same as @p input
+     * @param[in]  info            Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
+     *
+     * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
+     *
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
+     *
+     * @param[in] boxes      Source tensor info. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
+     * @param[in] pred_boxes Destination tensor info. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input
+     * @param[in] deltas     Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K  is the number of classes.
+     *                       Data types supported: QASYMM8 if @p input is QASYMM16, otherwise same as @p input
+     * @param[in] info       Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
+     *
+     * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
+     *
+     * @return a Status
+     */
+    static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_boxes;
+    ICLTensor       *_pred_boxes;
+    const ICLTensor *_deltas;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H */
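The sketch below validates a transform of M = 10 proposals with a single class (K = 1). The tensor shapes and the BoundingBoxTransformInfo arguments (assumed here to be image width, height and scale) are illustrative, not part of this patch.

    // Minimal sketch (assumed sizes): boxes/deltas/pred_boxes laid out as [4*K, M] with K = 1, M = 10.
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"

    using namespace arm_compute;

    void bbox_transform_validate_sketch()
    {
        const TensorInfo boxes(TensorShape(4U, 10U), 1, DataType::F32);      // [x1, y1, x2, y2] per proposal
        const TensorInfo deltas(TensorShape(4U, 10U), 1, DataType::F32);     // [dx, dy, dw, dh] per proposal and class
        const TensorInfo pred_boxes(TensorShape(4U, 10U), 1, DataType::F32); // transformed boxes

        const BoundingBoxTransformInfo bb_info(640.f, 480.f, 1.f); // assumed (img_width, img_height, scale)

        const Status status = CLBoundingBoxTransformKernel::validate(&boxes, &pred_boxes, &deltas, bb_info);
        ARM_COMPUTE_ERROR_THROW_ON(status);
    }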
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
index 2f6c09d..9f493b4 100644
--- a/src/core/CL/kernels/CLBox3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
+#include "src/core/CL/kernels/CLBox3x3Kernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.h b/src/core/CL/kernels/CLBox3x3Kernel.h
new file mode 100644
index 0000000..2373c4a
--- /dev/null
+++ b/src/core/CL/kernels/CLBox3x3Kernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBOX3X3KERNEL_H
+#define ARM_COMPUTE_CLBOX3X3KERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the box 3x3 filter kernel.
+ *
+ */
+class CLBox3x3Kernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input            An input tensor. Data types supported: U8
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            An input tensor. Data types supported: U8
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLBOX3X3KERNEL_H */
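The sketch below configures the 3x3 box filter standalone with border_undefined = true, so no border filling is required; within the library the CLBox3x3 runtime function normally pairs this kernel with a fill-border kernel instead. Shapes and the helper name are illustrative assumptions.

    // Minimal sketch (assumed shapes, default CL runtime): 3x3 box filter on a U8 image, undefined borders.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLBox3x3Kernel.h"

    using namespace arm_compute;

    void box3x3_sketch()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        const TensorInfo info(TensorShape(128U, 128U), 1, DataType::U8);
        src.allocator()->init(info);
        dst.allocator()->init(info);

        CLBox3x3Kernel box;
        box.configure(&src, &dst, true /* border_undefined: border pixels of dst are not computed */);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src, then run like any other ICLKernel ...
        CLScheduler::get().enqueue(box);
        CLScheduler::get().sync();
    }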
diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
index c76ec67..1fe944c 100644
--- a/src/core/CL/kernels/CLCannyEdgeKernel.cpp
+++ b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
+#include "src/core/CL/kernels/CLCannyEdgeKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.h b/src/core/CL/kernels/CLCannyEdgeKernel.h
new file mode 100644
index 0000000..7543822
--- /dev/null
+++ b/src/core/CL/kernels/CLCannyEdgeKernel.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCANNYEDGEKERNEL_H
+#define ARM_COMPUTE_CLCANNYEDGEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform Gradient computation.
+ */
+class CLGradientKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLGradientKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGradientKernel(const CLGradientKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGradientKernel &operator=(const CLGradientKernel &) = delete;
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @note gx, gy and magnitude must all use the same element size (either 16-bit or 32-bit).
+     *
+     * @param[in]  gx        Source tensor - Gx component. Data types supported: S16/S32.
+     * @param[in]  gy        Source tensor - Gy component. Data types supported: Same as gx.
+     * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy.
+     * @param[out] phase     Destination tensor - Quantized phase. Data types supported: U8.
+     * @param[in]  norm_type Normalization type. If 1, L1-Norm, otherwise L2-Norm.
+     */
+    void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type);
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @note gx, gy and magnitude must all use the same element size (either 16-bit or 32-bit).
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  gx              Source tensor - Gx component. Data types supported: S16/S32.
+     * @param[in]  gy              Source tensor - Gy component. Data types supported: Same as gx.
+     * @param[out] magnitude       Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy.
+     * @param[out] phase           Destination tensor - Quantized phase. Data types supported: U8.
+     * @param[in]  norm_type       Normalization type. If 1, L1-Norm, otherwise L2-Norm.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_gx;        /**< Source tensor - Gx component */
+    const ICLTensor *_gy;        /**< Source tensor - Gy component */
+    ICLTensor       *_magnitude; /**< Destination tensor - Magnitude */
+    ICLTensor       *_phase;     /**< Destination tensor - Quantized phase */
+};
+
+/** OpenCL kernel to perform Non-Maxima suppression for Canny Edge.
+ *
+ * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
+ *       to characterize points as possible edges. The output buffer needs to be cleared before this kernel is executed.
+ *
+ * @note Hysteresis is computed in @ref CLEdgeTraceKernel
+ */
+class CLEdgeNonMaxSuppressionKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLEdgeNonMaxSuppressionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLEdgeNonMaxSuppressionKernel(const CLEdgeNonMaxSuppressionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLEdgeNonMaxSuppressionKernel &operator=(const CLEdgeNonMaxSuppressionKernel &) = delete;
+    /** Initialise the kernel's sources, destination and border mode.
+     *
+     * @param[in]  magnitude        Source tensor - Magnitude. Data types supported: U16/U32.
+     * @param[in]  phase            Source tensor - Quantized phase. Data types supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U16/U32.
+     * @param[in]  lower_thr        Lower threshold.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined);
+    /** Initialise the kernel's sources, destination and border mode.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  magnitude        Source tensor - Magnitude. Data types supported: U16/U32.
+     * @param[in]  phase            Source tensor - Quantized phase. Data types supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U16/U32.
+     * @param[in]  lower_thr        Lower threshold.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_magnitude; /**< Source tensor - Magnitude. */
+    const ICLTensor *_phase;     /**< Source tensor - Quantized phase. */
+    ICLTensor       *_output;    /**< Destination tensor. */
+};
+
+/** OpenCL kernel to perform Edge tracing.
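+ *
+ * @note The visited, recorded, l1_stack and l1_stack_counter tensors passed to configure() are expected to be
+ *       zeroed before each run. A minimal sketch of that host-side pattern (illustrative helper; it assumes the
+ *       auxiliary tensors are CLTensor objects that have already been allocated):
+ * @code
+ * void zero_tensor(CLTensor &t)
+ * {
+ *     t.map(true);                                        // blocking map to the host
+ *     std::memset(t.buffer(), 0, t.info()->total_size()); // clear every byte
+ *     t.unmap();
+ * }
+ * @endcode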
+ */
+class CLEdgeTraceKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLEdgeTraceKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLEdgeTraceKernel(const CLEdgeTraceKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLEdgeTraceKernel &operator=(const CLEdgeTraceKernel &) = delete;
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]     input            Source tensor. Data types supported: U16/U32.
+     * @param[out]    output           Destination tensor. Data types supported: U8.
+     * @param[in]     upper_thr        Upper threshold used for the hysteresis
+     * @param[in]     lower_thr        Lower threshold used for the hysteresis
+     * @param[in,out] visited          Tensor for keeping the visited pixels. Data types supported: U32.
+     *                                 Expected to be initialized to 0 before each run.
+     * @param[in,out] recorded         Tensor for keeping the recorded pixels. Data types supported: U32.
+     *                                 Expected to be initialized to 0 before each run.
+     * @param[in,out] l1_stack         Tensor with the L1 stack for each pixel. Data types supported: S32.
+     *                                 Expected to be initialized to 0 before each run.
+     * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8.
+     *                                 Expected to be initialized to 0 before each run.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
+                   ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter);
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]     compile_context  The compile context to be used.
+     * @param[in]     input            Source tensor. Data types supported: U16/U32.
+     * @param[out]    output           Destination tensor. Data types supported: U8.
+     * @param[in]     upper_thr        Upper threshold used for the hysteresis
+     * @param[in]     lower_thr        Lower threshold used for the hysteresis
+     * @param[in,out] visited          Tensor for keeping the visited pixels. Data types supported: U32.
+     *                                 Expected to be initialized to 0 before each run.
+     * @param[in,out] recorded         Tensor for keeping the recorded pixels. Data types supported: U32.
+     *                                 Expected to be initialized to 0 before each run.
+     * @param[in,out] l1_stack         Tensor with the L1 stack for each pixel. Data types supported: S32.
+     *                                 Expected to be initialized to 0 before each run.
+     * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8.
+     *                                 Expected to be initialized to 0 before each run.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
+                   ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;            /**< Source tensor. */
+    ICLTensor       *_output;           /**< Destination tensor. */
+    int32_t          _lower_thr;        /**< Lower threshold used for the hysteresis. */
+    int32_t          _upper_thr;        /**< Upper threshold used for the hysteresis. */
+    ICLTensor       *_visited;          /**< Marks visited elements */
+    ICLTensor       *_recorded;         /**< Marks recorded elements */
+    ICLTensor       *_l1_stack;         /**< L1 hysteresis stack */
+    ICLTensor       *_l1_stack_counter; /**< L1 hysteresis stack counter */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCANNYEDGEKERNEL_H */
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
index d574f35..52ba9dd 100644
--- a/src/core/CL/kernels/CLChannelCombineKernel.cpp
+++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
+#include "src/core/CL/kernels/CLChannelCombineKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLMultiImage.h"
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.h b/src/core/CL/kernels/CLChannelCombineKernel.h
new file mode 100644
index 0000000..f19995a
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelCombineKernel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H
+#define ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include <array>
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLMultiImage;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the channel combine kernel */
+class CLChannelCombineKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLChannelCombineKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLChannelCombineKernel(const CLChannelCombineKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLChannelCombineKernel &operator=(const CLChannelCombineKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLChannelCombineKernel(CLChannelCombineKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLChannelCombineKernel &operator=(CLChannelCombineKernel &&) = default;
+    /** Default destructor */
+    ~CLChannelCombineKernel() = default;
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  plane0 The 2D plane that forms channel 0. Must be of U8 format.
+     * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
+     * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
+     * @param[in]  plane3 The 2D plane that forms channel 3. Must be of U8 format.
+     * @param[out] output The single planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422.
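+     *
+     * A minimal usage sketch (illustrative; it assumes all four U8 planes and the RGBA8888 output have already
+     * been initialised and allocated with matching dimensions):
+     * @code
+     * CLTensor r, g, b, a, rgba;          // r, g, b, a: U8 planes; rgba: RGBA8888 destination
+     * CLChannelCombineKernel combine;
+     * combine.configure(&r, &g, &b, &a, &rgba);
+     * CLScheduler::get().enqueue(combine);
+     * @endcode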
+     */
+    void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  plane0          The 2D plane that forms channel 0. Must be of U8 format.
+     * @param[in]  plane1          The 2D plane that forms channel 1. Must be of U8 format.
+     * @param[in]  plane2          The 2D plane that forms channel 2. Must be of U8 format.
+     * @param[in]  plane3          The 2D plane that forms channel 3. Must be of U8 format.
+     * @param[out] output          The single planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  plane0 The 2D plane that forms channel 0. Must be of U8 format.
+     * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
+     * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
+     * @param[out] output The multi planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422.
+     */
+    void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  plane0          The 2D plane that forms channel 0. Must be of U8 format.
+     * @param[in]  plane1          The 2D plane that forms channel 1. Must be of U8 format.
+     * @param[in]  plane2          The 2D plane that forms channel 2. Must be of U8 format.
+     * @param[out] output          The multi planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    std::array<const ICLTensor *, 4> _planes;
+    ICLTensor     *_output;
+    ICLMultiImage *_output_multi;
+    std::array<uint32_t, 3> _x_subsampling;
+    std::array<uint32_t, 3> _y_subsampling;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H */
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
index 7911b94..cbf504b 100644
--- a/src/core/CL/kernels/CLChannelExtractKernel.cpp
+++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
+#include "src/core/CL/kernels/CLChannelExtractKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLMultiImage.h"
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.h b/src/core/CL/kernels/CLChannelExtractKernel.h
new file mode 100644
index 0000000..37abde5
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelExtractKernel.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H
+#define ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLMultiImage;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the channel extract kernel */
+class CLChannelExtractKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLChannelExtractKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLChannelExtractKernel(const CLChannelExtractKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLChannelExtractKernel &operator=(const CLChannelExtractKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLChannelExtractKernel(CLChannelExtractKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLChannelExtractKernel &operator=(CLChannelExtractKernel &&) = default;
+    /** Default destructor */
+    ~CLChannelExtractKernel() = default;
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input   Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
+     * @param[in]  channel Channel to extract.
+     * @param[out] output  Destination tensor. Must be of U8 format.
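+     *
+     * A minimal usage sketch (illustrative; it assumes both tensors are already initialised and allocated):
+     * @code
+     * CLTensor rgb, red;                  // rgb: RGB888 source; red: U8 destination of the same width/height
+     * CLChannelExtractKernel extract;
+     * extract.configure(&rgb, Channel::R, &red);
+     * CLScheduler::get().enqueue(extract);
+     * @endcode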
+     */
+    void configure(const ICLTensor *input, Channel channel, ICLTensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
+     * @param[in]  channel         Channel to extract.
+     * @param[out] output          Destination tensor. Must be of U8 format.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input   Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
+     * @param[in]  channel Channel to extract.
+     * @param[out] output  Single-planar 2D destination image. Must be of U8 format.
+     */
+    void configure(const ICLMultiImage *input, Channel channel, ICLImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
+     * @param[in]  channel         Channel to extract.
+     * @param[out] output          Single-planar 2D destination image. Must be of U8 format.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    uint32_t         _num_elems_processed_per_iteration;
+    uint32_t         _subsampling;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H */
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index 301a762..c969792 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h"
+#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
new file mode 100644
index 0000000..31c007f
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H
+#define ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the channel shuffle kernel */
+class CLChannelShuffleLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLChannelShuffleLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLChannelShuffleLayerKernel(const CLChannelShuffleLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLChannelShuffleLayerKernel &operator=(const CLChannelShuffleLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLChannelShuffleLayerKernel(CLChannelShuffleLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLChannelShuffleLayerKernel &operator=(CLChannelShuffleLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLChannelShuffleLayerKernel() = default;
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  input      Input tensor. Data types supported: All.
+     * @param[out] output     Output tensor. Data type supported: Same as @p input
+     * @param[in]  num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
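+     *
+     * A minimal usage sketch (illustrative; the group count is an assumption chosen so that the channel count
+     * of the tensors is a multiple of it):
+     * @code
+     * CLTensor src, dst;                  // e.g. NCHW tensors with 8 channels
+     * CLChannelShuffleLayerKernel shuffle;
+     * shuffle.configure(&src, &dst, 4);   // 8 channels shuffled across 4 groups
+     * CLScheduler::get().enqueue(shuffle);
+     * @endcode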
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data types supported: All.
+     * @param[out] output          Output tensor. Data type supported: Same as @p input
+     * @param[in]  num_groups      Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
+     *
+     * @param[in] input      Input tensor info. Data types supported: All.
+     * @param[in] output     Output tensor info. Data type supported: Same as @p input
+     * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 3dc007d..44b8471 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+#include "src/core/CL/kernels/CLCol2ImKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLCol2ImKernel.h b/src/core/CL/kernels/CLCol2ImKernel.h
new file mode 100644
index 0000000..710e048
--- /dev/null
+++ b/src/core/CL/kernels/CLCol2ImKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCOL2IMKERNEL_H
+#define ARM_COMPUTE_CLCOL2IMKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the col2im reshaping kernel.
+ *
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CLIm2ColKernel.
+ *
+ * For example, a vector of 9 elements can be reshaped to a block (image) of 3x3:
+ *
+ * @f[
+ * \left( \begin{array}{ccccccccc}
+ * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccc}
+ * a0 & a1 & a2 \\
+ * a3 & a4 & a5 \\
+ * a6 & a7 & a8 \\
+ * \end{array} \right)
+ * @f]
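+ *
+ * A minimal configuration sketch matching the 9-element example above (illustrative shapes; the usual
+ * CLTensor initialisation and allocation are assumed to have been done already):
+ * @code
+ * CLTensor col, img;                            // col: im2col-style matrix; img: [3, 3, OFM] NCHW output
+ * CLCol2ImKernel col2im;
+ * col2im.configure(&col, &img, Size2D(3U, 3U)); // the convolved output is 3x3, default num_groups = 1
+ * CLScheduler::get().enqueue(col2im);
+ * @endcode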
+ */
+class CLCol2ImKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLCol2ImKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCol2ImKernel(const CLCol2ImKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCol2ImKernel &operator=(const CLCol2ImKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLCol2ImKernel(CLCol2ImKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLCol2ImKernel &operator=(CLCol2ImKernel &&) = default;
+    /** Default destructor */
+    ~CLCol2ImKernel() = default;
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input          The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+     *                            while the rest represent a batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
+     * @param[in]  convolved_dims Output convolved dimensions.
+     * @param[in]  num_groups     (Optional) Number of groups when performing a grouped convolution
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[out] output          The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+     *                             while the rest represent a batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
+     * @param[in]  convolved_dims  Output convolved dimensions.
+     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLCol2ImKernel
+     *
+     * @param[in] input          The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+     *                           while the rest represent a batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
+     * @param[in] convolved_dims Output convolved dimensions.
+     * @param[in] num_groups     (Optional) Number of groups when performing a grouped convolution
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+public:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    Size2D           _convolved_dims;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCOL2IMKERNEL_H */
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
index 0f82d87..6c61fec 100644
--- a/src/core/CL/kernels/CLColorConvertKernel.cpp
+++ b/src/core/CL/kernels/CLColorConvertKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
+#include "src/core/CL/kernels/CLColorConvertKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLMultiImage.h"
diff --git a/src/core/CL/kernels/CLColorConvertKernel.h b/src/core/CL/kernels/CLColorConvertKernel.h
new file mode 100644
index 0000000..0f08291
--- /dev/null
+++ b/src/core/CL/kernels/CLColorConvertKernel.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCOLORCONVERTKERNEL_H
+#define ARM_COMPUTE_CLCOLORCONVERTKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLMultiImage;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the color convert kernel.
+ *
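+ * A minimal single-planar usage sketch (illustrative; both tensors are assumed to be initialised and
+ * allocated with matching dimensions):
+ * @code
+ * CLTensor yuyv, rgb;                  // yuyv: YUYV422 source; rgb: RGB888 destination
+ * CLColorConvertKernel convert;
+ * convert.configure(&yuyv, &rgb);      // the conversion path is deduced from the two formats
+ * CLScheduler::get().enqueue(convert);
+ * @endcode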
+ */
+class CLColorConvertKernel : public ICLKernel
+{
+public:
+    /** Default constructor. */
+    CLColorConvertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLColorConvertKernel(const CLColorConvertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLColorConvertKernel &operator=(const CLColorConvertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLColorConvertKernel(CLColorConvertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLColorConvertKernel &operator=(CLColorConvertKernel &&) = default;
+    /** Default destructor. */
+    ~CLColorConvertKernel() = default;
+
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
+     * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
+     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888),
+     *                                                          U8 (if the format of @p input is RGB888)
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
+     * @param[out] output          Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
+     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888),
+     *                                                          U8 (if the format of @p input is RGB888)
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
+     */
+    void configure(const ICLMultiImage *input, ICLImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output          Single-planar destination image. Formats supported: RGB888/RGBA8888
+     */
+    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+     * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
+     */
+    void configure(const ICLImage *input, ICLMultiImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+     * @param[out] output          Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
+     */
+    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
+     */
+    void configure(const ICLMultiImage *input, ICLMultiImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output          Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
+     */
+    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor     *_input;        /**< Pointer to the single-planar tensor input */
+    ICLTensor           *_output;       /**< Pointer to the single-planar tensor output */
+    const ICLMultiImage *_multi_input;  /**< Pointer to the multi-planar input */
+    ICLMultiImage       *_multi_output; /**< Pointer to the multi-planar output */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCOLORCONVERTKERNEL_H */
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index 2b72946..e2aee36 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
+#include "src/core/CL/kernels/CLComparisonKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h
new file mode 100644
index 0000000..0b94190
--- /dev/null
+++ b/src/core/CL/kernels/CLComparisonKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCOMPARISONKERNEL_H
+#define ARM_COMPUTE_CLCOMPARISONKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the comparison kernel. */
+class CLComparisonKernel : public ICLKernel
+{
+public:
+    /** Default constructor. */
+    CLComparisonKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLComparisonKernel(const CLComparisonKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLComparisonKernel &operator=(const CLComparisonKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLComparisonKernel(CLComparisonKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLComparisonKernel &operator=(CLComparisonKernel &&) = default;
+    /** Default destructor */
+    ~CLComparisonKernel() = default;
+    /** Set the inputs and output tensors
+     *
+     * @param[in]  input1    Source tensor. Data types supported: All.
+     * @param[in]  input2    Source tensor. Data types supported: Same as @p input1.
+     * @param[out] output    Destination tensor. Data types supported: U8.
+     * @param[in]  operation Comparison operation to use.
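+     *
+     * A minimal usage sketch (illustrative; the inputs share a data type and the output is a U8 mask that is
+     * non-zero where the comparison holds and 0 elsewhere):
+     * @code
+     * CLTensor a, b, mask;                // a, b: e.g. F32; mask: U8
+     * CLComparisonKernel cmp;
+     * cmp.configure(&a, &b, &mask, ComparisonOperation::Greater);
+     * CLScheduler::get().enqueue(cmp);
+     * @endcode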
+     */
+    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+    /** Set the inputs and output tensors
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          Source tensor. Data types supported: All.
+     * @param[in]  input2          Source tensor. Data types supported: Same as @p input1.
+     * @param[out] output          Destination tensor. Data types supported: U8.
+     * @param[in]  operation       Comparison operation to use.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel
+     *
+     * @param[in] input1    Source tensor. Data types supported: All.
+     * @param[in] input2    Source tensor. Data types supported: Same as @p input1.
+     * @param[in] output    Destination tensor. Data types supported: U8.
+     * @param[in] operation Comparison operation to use.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input1; /**< Source tensor 1 */
+    const ICLTensor *_input2; /**< Source tensor 2 */
+    ICLTensor       *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCOMPARISONKERNEL_H */
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
index c7888c9..dcf4e66 100644
--- a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
+#include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
new file mode 100644
index 0000000..d1da793
--- /dev/null
+++ b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
+#define ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
+ *
+ * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
+ *       - It follows a Convolution layer
+ *       - The data layout used by the network does not match the one the model has been trained in.
+ *
+ * @note This function assumes the weights are already reshaped (transposed)
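+ *
+ * A minimal configuration sketch (illustrative values; original_input_shape is the shape of the tensor that
+ * fed the fully connected layer before flattening):
+ * @code
+ * CLTensor weights, converted;                  // both 2D, same shape and data type
+ * CLConvertFullyConnectedWeightsKernel convert;
+ * convert.configure(&weights, &converted,
+ *                   TensorShape(7U, 7U, 64U),   // e.g. a 7x7x64 feature map entering the FC layer
+ *                   DataLayout::NCHW);          // the layout the weights were trained in
+ * CLScheduler::get().enqueue(convert);
+ * @endcode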
+ */
+class CLConvertFullyConnectedWeightsKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLConvertFullyConnectedWeightsKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLConvertFullyConnectedWeightsKernel(const CLConvertFullyConnectedWeightsKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLConvertFullyConnectedWeightsKernel &operator=(const CLConvertFullyConnectedWeightsKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLConvertFullyConnectedWeightsKernel(CLConvertFullyConnectedWeightsKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLConvertFullyConnectedWeightsKernel &operator=(CLConvertFullyConnectedWeightsKernel &&) = default;
+    /** Default destructor */
+    ~CLConvertFullyConnectedWeightsKernel() = default;
+    /** Set the input and output tensor.
+     *
+     * @param[in]  input                Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
+     * @param[out] output               The converted weights tensor. Shape and Data Type: Same as @p input.
+     * @param[in]  original_input_shape Shape of the original input tensor (the one entering the fully connected layer).
+     * @param[in]  data_layout          The data layout the weights have been trained in.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+    /** Set the input and output tensor.
+     *
+     * @param[in]  compile_context      The compile context to be used.
+     * @param[in]  input                Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
+     * @param[out] output               The converted weights tensor. Shape and Data Type: Same as @p input.
+     * @param[in]  original_input_shape Shape of the original input tensor (the one entering the fully connected layer).
+     * @param[in]  data_layout          The data layout the weights have been trained in.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeightsKernel
+     *
+     * @param[in] input                Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
+     * @param[in] output               The converted weights tensor info. Shape and Data Type: Same as @p input.
+     * @param[in] original_input_shape Shape of the original input tensor (the one entering the fully connected layer).
+     * @param[in] data_layout          The data layout the weights have been trained in.
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H */
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
index 48b185f..21f1047 100644
--- a/src/core/CL/kernels/CLConvolutionKernel.cpp
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
+#include "src/core/CL/kernels/CLConvolutionKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
@@ -33,6 +32,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLConvolutionKernel.h b/src/core/CL/kernels/CLConvolutionKernel.h
new file mode 100644
index 0000000..33e73ca
--- /dev/null
+++ b/src/core/CL/kernels/CLConvolutionKernel.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCONVOLUTIONKERNEL_H
+#define ARM_COMPUTE_CLCONVOLUTIONKERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/****************************************************************************************\
+ *                                    Square Convolution                                *
+\****************************************************************************************/
+
+/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
+ * The client can supply a convolution matrix \f$ C_{m,n} \f$.
+ * @f{eqnarray}{
+ *  k_0 &=& \frac{m}{2}  \\
+ *  l_0 &=& \frac{n}{2}  \\
+ *  sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
+ *  @f}
+ *
+ * @note The above equation for this function is similar to the default OpenCV Filter2D function,
+ *       which actually computes a correlation and not a convolution.
+ *       In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
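+ *
+ * A minimal 3x3 usage sketch (illustrative; src and dst are assumed to be U8 images of the same size that
+ * have already been initialised and allocated):
+ * @code
+ * const int16_t gaussian3x3[] = { 1, 2, 1,
+ *                                 2, 4, 2,
+ *                                 1, 2, 1 };
+ * CLTensor src, dst;
+ * CLConvolution3x3Kernel conv;
+ * conv.configure(&src, &dst, gaussian3x3, 0, false); // scale 0 -> sum of the coefficients (16 here)
+ * CLScheduler::get().enqueue(conv);
+ * @endcode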
+ */
+template <unsigned int matrix_size>
+class CLConvolutionKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+
+/** Interface for the kernel which applies a 3x3 convolution to a tensor. */
+using CLConvolution3x3Kernel = CLConvolutionKernel<3>;
+/** Interface for the kernel which applies a 5x5 convolution to a tensor. */
+using CLConvolution5x5Kernel = CLConvolutionKernel<5>;
+/** Interface for the kernel which applies a 7x7 convolution to a tensor. */
+using CLConvolution7x7Kernel = CLConvolutionKernel<7>;
+/** Interface for the kernel which applies a 9x9 convolution to a tensor. */
+using CLConvolution9x9Kernel = CLConvolutionKernel<9>;
+
+/****************************************************************************************\
+ *                              Separable Square Convolution                            *
+\****************************************************************************************/
+
+/** Kernel for the Horizontal pass of a Separable Convolution. Currently supports 5x5, 7x7, 9x9 */
+template <unsigned int matrix_size>
+class CLSeparableConvolutionHorKernel : public ICLSimple2DKernel
+{
+public:
+    /** Default Constructor */
+    CLSeparableConvolutionHorKernel();
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined);
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U16/S16/S32.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+
+private:
+    BorderSize _border_size; /**< Border size */
+};
+
+/** Interface for the kernel which applies a horizontal pass of 5x5 convolution to a tensor. */
+using CLSeparableConvolution5x5HorKernel = CLSeparableConvolutionHorKernel<5>;
+/** Interface for the kernel which applies a horizontal pass of 7x7 convolution to a tensor. */
+using CLSeparableConvolution7x7HorKernel = CLSeparableConvolutionHorKernel<7>;
+/** Interface for the kernel which applies a horizontal pass of 9x9 convolution to a tensor. */
+using CLSeparableConvolution9x9HorKernel = CLSeparableConvolutionHorKernel<9>;
+
+/** Kernel for the Vertical pass of a Separable Convolution. Currently supports 5x5, 7x7, 9x9 */
+template <unsigned int matrix_size>
+class CLSeparableConvolutionVertKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U16/S16/S32.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     * @param[in]  data_type        Data type to use for intermediate result. @sa data_type_for_convolution
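+     *
+     * A minimal sketch of the usual two-pass chain (illustrative; the tensors are assumed to be initialised
+     * and allocated, with the intermediate tensor using the data type chosen for the horizontal pass):
+     * @code
+     * const int16_t row_coeffs[5] = { 1, 4, 6, 4, 1 }; // illustrative separable 5x5 coefficients
+     * const int16_t col_coeffs[5] = { 1, 4, 6, 4, 1 };
+     * CLTensor src, tmp, dst;                          // src: U8; tmp: S16 intermediate; dst: U8
+     * CLSeparableConvolution5x5HorKernel  hor;
+     * CLSeparableConvolution5x5VertKernel vert;
+     * hor.configure(&src, &tmp, row_coeffs, false);
+     * vert.configure(&tmp, &dst, col_coeffs, 256, false, DataType::S16); // 256 = sum of the full 5x5 matrix
+     * CLScheduler::get().enqueue(hor);
+     * CLScheduler::get().enqueue(vert);
+     * @endcode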
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U16/S16/S32.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     * @param[in]  data_type        Data type to use for intermediate result. @sa data_type_for_convolution
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+
+/** Interface for the kernel which applies a vertical pass of 5x5 convolution to a tensor. */
+using CLSeparableConvolution5x5VertKernel = CLSeparableConvolutionVertKernel<5>;
+/** Interface for the kernel which applies a vertical pass of 7x7 convolution to a tensor. */
+using CLSeparableConvolution7x7VertKernel = CLSeparableConvolutionVertKernel<7>;
+/** Interface for the kernel which applies a vertical pass of 9x9 convolution to a tensor. */
+using CLSeparableConvolution9x9VertKernel = CLSeparableConvolutionVertKernel<9>;
+
+/****************************************************************************************\
+ *                                 Rectangle Convolution                                *
+\****************************************************************************************/
+
+/** Kernel for running a convolution on a rectangle matrix.
+ *
+ * @note Supports any combination of 3, 5, 7 and 9 for the matrix width and height.
+ */
+class CLConvolutionRectangleKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLConvolutionRectangleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLConvolutionRectangleKernel(const CLConvolutionRectangleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLConvolutionRectangleKernel &operator=(const CLConvolutionRectangleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLConvolutionRectangleKernel(CLConvolutionRectangleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLConvolutionRectangleKernel &operator=(CLConvolutionRectangleKernel &&) = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8/S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  width            Width of convolution matrix (Number of columns)
+     * @param[in]  height           Height of convolution matrix (Number of rows)
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8/S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  width            Width of convolution matrix (Number of columns)
+     * @param[in]  height           Height of convolution matrix (Number of rows)
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    BorderSize       _border_size;
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLCONVOLUTIONKERNEL_H */
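The horizontal/vertical kernel pair above is normally driven by the runtime-level CLConvolution5x5 function, which also takes care of border filling and of the intermediate tensor. As a rough, hedged sketch of how the two passes chain together at kernel level (the box-filter coefficients, the U16 intermediate type and the tensor setup are assumptions):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLConvolutionKernel.h"

    using namespace arm_compute;

    // Illustrative only: a 5x5 box blur split into a 1x5 row pass and a 5x1 column pass.
    // src_u8 and dst_u8 are assumed to be allocated U8 tensors of the same shape, and
    // CLScheduler is assumed to be initialised (e.g. CLScheduler::get().default_init()).
    void separable_box5x5(CLTensor &src_u8, CLTensor &dst_u8)
    {
        const int16_t row_filter[5] = { 1, 1, 1, 1, 1 };
        const int16_t col_filter[5] = { 1, 1, 1, 1, 1 };

        CLTensor tmp; // holds the horizontal pass result; U16 is an assumed intermediate type
        tmp.allocator()->init(TensorInfo(src_u8.info()->tensor_shape(), 1, DataType::U16));

        CLSeparableConvolution5x5HorKernel  hor;
        CLSeparableConvolution5x5VertKernel vert;
        hor.configure(&src_u8, &tmp, row_filter, /* border_undefined */ true);
        vert.configure(&tmp, &dst_u8, col_filter, /* scale */ 25, /* border_undefined */ true);

        tmp.allocator()->allocate();
        CLScheduler::get().enqueue(hor, false);
        CLScheduler::get().enqueue(vert);
    }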
diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp
index 184b80c..ca38b65 100644
--- a/src/core/CL/kernels/CLCopyKernel.cpp
+++ b/src/core/CL/kernels/CLCopyKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
+#include "src/core/CL/kernels/CLCopyKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLCopyKernel.h b/src/core/CL/kernels/CLCopyKernel.h
new file mode 100644
index 0000000..9a20b88
--- /dev/null
+++ b/src/core/CL/kernels/CLCopyKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCOPYKERNEL_H
+#define ARM_COMPUTE_CLCOPYKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a copy between two tensors */
+class CLCopyKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLCopyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLCopyKernel(const CLCopyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLCopyKernel &operator=(const CLCopyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLCopyKernel(CLCopyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLCopyKernel &operator=(CLCopyKernel &&) = default;
+    /** Initialize the kernel's input and output.
+     *
+     * @param[in]  input         Source tensor. Data types supported: All.
+     * @param[out] output        Destination tensor. Data types supported: same as @p input.
+     * @param[in]  output_window (Optional) Window to be used when copying into only a part of the output tensor. Default is nullptr.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, Window *output_window = nullptr);
+    /** Initialize the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: All.
+     * @param[out] output          Destination tensor. Data types supported: same as @p input.
+     * @param[in]  output_window   (Optional) Window to be used when copying into only a part of the output tensor. Default is nullptr.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Window *output_window = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLCopyKernel
+     *
+     * @param[in] input         Source tensor info. Data types supported: All.
+     * @param[in] output        Destination tensor info. Data types supported: same as @p input.
+     * @param[in] output_window (Optional) Window to be used when copying into only a part of the output tensor. Default is nullptr.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, Window *output_window = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    Window           _output_window;
+    bool             _has_output_window;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLCOPYKERNEL_H */
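A minimal, hedged sketch of driving CLCopyKernel directly (a runtime-level copy function is the usual entry point); the tensors are assumed to be allocated with matching shape and type:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLCopyKernel.h"

    using namespace arm_compute;

    // Illustrative only: copy one whole tensor into another of the same shape and type.
    void copy_tensor(CLTensor &src, CLTensor &dst)
    {
        ARM_COMPUTE_ERROR_THROW_ON(CLCopyKernel::validate(src.info(), dst.info()));

        CLCopyKernel copy;
        copy.configure(&src, &dst); // no output window: the full tensor is copied
        CLScheduler::get().enqueue(copy);
    }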
diff --git a/src/core/CL/kernels/CLCropKernel.cpp b/src/core/CL/kernels/CLCropKernel.cpp
index 2c99d46..9cf15ff 100644
--- a/src/core/CL/kernels/CLCropKernel.cpp
+++ b/src/core/CL/kernels/CLCropKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLCropKernel.h"
+#include "src/core/CL/kernels/CLCropKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLCropKernel.h b/src/core/CL/kernels/CLCropKernel.h
new file mode 100644
index 0000000..cbfada5
--- /dev/null
+++ b/src/core/CL/kernels/CLCropKernel.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCROPKERNEL_H
+#define ARM_COMPUTE_CLCROPKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to crop a part of a tensor into another tensor */
+class CLCropKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLCropKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLCropKernel(const CLCropKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLCropKernel &operator=(const CLCropKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLCropKernel(CLCropKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLCropKernel &operator=(CLCropKernel &&) = default;
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  input               Source tensor. Data type supported: All. Data layouts supported: NHWC.
+     * @param[out] output              Destination tensor. Data type supported: F32
+     * @param[in]  start               Coordinates of where to start cropping the image.
+     * @param[in]  end                 Coordinates of where to end cropping the image.
+     * @param[in]  batch_index         Fourth dimension index of the 3D image to crop in @p input.
+     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
+     * @param[in]  output_window       Output window to be used in case the cropped image is being copied into a tensor. Default is nullptr.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, Window *output_window = nullptr);
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  compile_context     The compile context to be used.
+     * @param[in]  input               Source tensor. Data type supported: All. Data layouts supported: NHWC.
+     * @param[out] output              Destination tensor. Data type supported: F32
+     * @param[in]  start               Coordinates of where to start cropping the image.
+     * @param[in]  end                 Coordinates of where to end cropping the image.
+     * @param[in]  batch_index         Fourth dimension index of the 3D image to crop in @p input.
+     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
+     * @param[in]  output_window       Output window to be used in case the cropped image is being copied into a tensor. Default is nullptr.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
+                   Window *output_window = nullptr);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLCropKernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in] input               Source tensor info. Data type supported: All. Data layouts supported: NHWC.
+     * @param[in] output              Destination tensor info. Data type supported: F32
+     * @param[in] start               Coordinates of where to start cropping the image.
+     * @param[in] end                 Coordinates of where to end cropping the image.
+     * @param[in] batch_index         Fourth dimension index of the 3D image to crop in @p input.
+     * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
+     * @param[in] output_window       Output window to be used in case the cropped image is being copied into a tensor. Default is nullptr.
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
+                           Window *output_window = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    Coordinates2D    _start;
+    uint32_t         _batch_index;
+    float            _extrapolation_value;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLCROPKERNEL_H */
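A hedged usage sketch for the crop kernel above; the crop window, the batch index and the tensor setup (NHWC source of any type, F32 destination) are assumptions supplied by the caller:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLCropKernel.h"

    using namespace arm_compute;

    // Illustrative only: crop a 2D region out of batch 0 of an NHWC tensor.
    void crop_batch0(CLTensor &src_nhwc, CLTensor &dst_f32)
    {
        const Coordinates2D start{ 4, 4 };   // top-left corner (x, y), assumed
        const Coordinates2D end{ 19, 19 };   // bottom-right corner (x, y), assumed

        ARM_COMPUTE_ERROR_THROW_ON(
            CLCropKernel::validate(src_nhwc.info(), dst_f32.info(), start, end, /* batch_index */ 0));

        CLCropKernel crop;
        crop.configure(&src_nhwc, &dst_f32, start, end, 0, /* extrapolation_value */ 0.f);
        CLScheduler::get().enqueue(crop);
    }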
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index 9ba3dc3..d01a00d 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
new file mode 100644
index 0000000..e0d1322
--- /dev/null
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H
+#define ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the Deconvolution layer kernel on OpenCL.
+ */
+class CLDeconvolutionLayerUpsampleKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLDeconvolutionLayerUpsampleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDeconvolutionLayerUpsampleKernel(const CLDeconvolutionLayerUpsampleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDeconvolutionLayerUpsampleKernel &operator=(const CLDeconvolutionLayerUpsampleKernel &) = delete;
+    /** Default Move Constructor. */
+    CLDeconvolutionLayerUpsampleKernel(CLDeconvolutionLayerUpsampleKernel &&) = default;
+    /** Default move assignment operator */
+    CLDeconvolutionLayerUpsampleKernel &operator=(CLDeconvolutionLayerUpsampleKernel &&) = default;
+    /** Default destructor */
+    ~CLDeconvolutionLayerUpsampleKernel() = default;
+
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Source tensor. Data types supported: All.
+     * @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  info   Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: All.
+     * @param[out] output          Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  info            Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
+     *
+     * @param[in] input  Source tensor info. Data types supported: All.
+     * @param[in] output Destination tensor info. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in] info   Contains padding and stride information described in @ref PadStrideInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    PadStrideInfo    _info;
+    DataLayout       _data_layout;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H */
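A hedged sketch of the upsample step as it is used inside a deconvolution (the CLDeconvolutionLayer runtime function normally owns this kernel); the 2x2 stride and the caller-provided tensors are assumptions:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"

    using namespace arm_compute;

    // Illustrative only: scatter the input into a larger output with a 2x2 stride in the XY-plane.
    void upsample_stride2(CLTensor &src, CLTensor &dst)
    {
        const PadStrideInfo info(2, 2, 0, 0); // stride 2x2, no padding (assumed)

        ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(src.info(), dst.info(), info));

        CLDeconvolutionLayerUpsampleKernel upsample;
        upsample.configure(&src, &dst, info);
        CLScheduler::get().enqueue(upsample);
    }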
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
index 1514d90..ea22ec0 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
+#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
new file mode 100644
index 0000000..ce354fa
--- /dev/null
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H
+#define ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H
+
+#include "src/core/CL/ICLSimpleKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the OpenCL kernel to be used for reshaping the tensor before returning the result of deconvolution.
+ *
+ * The input tensor to this OpenCL kernel is expected to be the result of a @ref CLGEMM operation between the Deconvolution input and the Deconvolution filter.
+ *
+ * The input tensor should have the following shape: [filter_width * filter_height * ofms, width, height, batch_size]
+ *
+ * The output tensor should have the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
+ *
+ * For example, given a tensor with dimensions [4, 2, 2], this kernel returns a tensor with dimensions [1, 4, 4].
+ *
+ */
+class CLDeconvolutionReshapeOutputKernel : public ICLSimpleKernel
+{
+public:
+    /** Default constructor */
+    CLDeconvolutionReshapeOutputKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDeconvolutionReshapeOutputKernel(const CLDeconvolutionReshapeOutputKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDeconvolutionReshapeOutputKernel &operator=(const CLDeconvolutionReshapeOutputKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLDeconvolutionReshapeOutputKernel(CLDeconvolutionReshapeOutputKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLDeconvolutionReshapeOutputKernel &operator=(CLDeconvolutionReshapeOutputKernel &&) = default;
+    /** Default destructor */
+    ~CLDeconvolutionReshapeOutputKernel() = default;
+
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  input        Input tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+     * @param[in]  bias         Bias tensor to be added directly during the reshape operation. Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[out] output       Output tensor with the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
+     *                          Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[in]  input_info   Deconvolution input tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[in]  weights_info Deconvolution weights tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[in]  deconv_info  Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+     * @param[in]  bias            Bias tensor to be added directly during the reshape operation. Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[out] output          Output tensor with the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
+     *                             Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[in]  input_info      Deconvolution input tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[in]  weights_info    Deconvolution weights tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[in]  deconv_info     Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+                   const PadStrideInfo &deconv_info);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref  CLDeconvolutionReshapeOutputKernel.
+     *
+     * @param[in] input        GEMM output tensor info to be reshaped. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+     * @param[in] bias         (Optional) Bias tensor info to be added directly during the reshape operation. Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[in] output       Reshaped output tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[in] input_info   Original input tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[in] weights_info Original weights tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
+     * @param[in] deconv_info  Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
+     *
+     * @return a Status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool             _add_bias;
+    const ICLTensor *_bias;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H */
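A hedged sketch of where this reshape sits after the GEMM stage of a deconvolution; every tensor and tensor info is assumed to be prepared by the caller with the shapes described in the class comment, and the CLDeconvolutionLayer runtime function is what normally performs this wiring:

    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"

    using namespace arm_compute;

    // Illustrative only: reshape (and optionally add the bias to) the GEMM result.
    void reshape_deconv_output(const ICLTensor *gemm_out, const ICLTensor *bias, ICLTensor *reshaped,
                               const ITensorInfo *deconv_input_info, const ITensorInfo *weights_info,
                               const PadStrideInfo &deconv_info)
    {
        ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionReshapeOutputKernel::validate(
            gemm_out->info(), (bias != nullptr) ? bias->info() : nullptr, reshaped->info(),
            deconv_input_info, weights_info, deconv_info));

        CLDeconvolutionReshapeOutputKernel reshape;
        reshape.configure(gemm_out, bias, reshaped, deconv_input_info, weights_info, deconv_info);
        CLScheduler::get().enqueue(reshape);
    }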
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index cb5d727..78adfd2 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
+#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.h b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.h
new file mode 100644
index 0000000..6c73bd4
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H
+#define ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** Interface for the depth concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class CLDepthConcatenateLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLDepthConcatenateLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthConcatenateLayerKernel(const CLDepthConcatenateLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthConcatenateLayerKernel &operator=(const CLDepthConcatenateLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLDepthConcatenateLayerKernel(CLDepthConcatenateLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLDepthConcatenateLayerKernel &operator=(CLDepthConcatenateLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLDepthConcatenateLayerKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in]     input           Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]     depth_offset    The offset on the Z axis.
+     * @param[in,out] output          Output tensor. Data types supported: Same as @p input.
+     *
+     * @note The output tensor's two lowest dimensions can't be smaller than the input's.
+     * @note The gaps between the two lowest dimensions of input and output need to be divisible by 2.
+     *
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output);
+    /**  Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel
+     *
+     * @param[in] input        Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in] depth_offset The offset on the Z axis.
+     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    unsigned int _depth_offset;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H */
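Unlike most of the kernels in this patch, this one already exposes the operator-style interface: configure() takes ITensorInfo pointers only and the actual tensors are bound at run time through an ITensorPack. A hedged sketch (the depth offset and the tensor setup are assumptions):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h"

    using namespace arm_compute;

    // Illustrative only: write one input into the output starting at a given Z offset.
    void concat_along_depth(CLTensor &src, CLTensor &dst, unsigned int depth_offset)
    {
        ARM_COMPUTE_ERROR_THROW_ON(
            CLDepthConcatenateLayerKernel::validate(src.info(), depth_offset, dst.info()));

        CLDepthConcatenateLayerKernel concat;
        concat.configure(CLKernelLibrary::get().get_compile_context(), src.info(), depth_offset, dst.info());

        // The kernel only stores the offset; the tensors themselves travel in the pack.
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        CLScheduler::get().enqueue_op(concat, pack);
    }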
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
index 452a14b..c98d66f 100644
--- a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.h b/src/core/CL/kernels/CLDepthConvertLayerKernel.h
new file mode 100644
index 0000000..8b511c6
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConvertLayerKernel.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H
+#define ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLSimple3DKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the depth conversion kernel. */
+class CLDepthConvertLayerKernel : public ICLSimple3DKernel
+{
+public:
+    /** Set the input and output of the kernel.
+     *
+     * Valid conversions Input -> Output :
+     *
+     *   - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
+     *   - U8  -> S8, U16, S16, U32, S32, F16, F32
+     *   - U16 -> U8, S8, S16, U32, S32, F16, F32
+     *   - S16 -> U8, S8, U16, U32, S32, F16, F32
+     *   - U32 -> U8, S8, U16, S16, S32, F16, F32
+     *   - S32 -> U8, S8, U16, S16, U32, F16, F32
+     *   - F16 -> U8, S8, U16, S16, U32, F32
+     *   - F32 -> U8, S8, U16, S16, U32, F16
+     *
+     * @param[in]  input  The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
+     * @param[out] output The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  policy Conversion policy
+     * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
+    /** Set the input and output of the kernel.
+     *
+     * Valid conversions Input -> Output :
+     *
+     *   - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
+     *   - U8  -> S8, U16, S16, U32, S32, F16, F32
+     *   - U16 -> U8, S8, S16, U32, S32, F16, F32
+     *   - S16 -> U8, S8, U16, U32, S32, F16, F32
+     *   - U32 -> U8, S8, U16, S16, S32, F16, F32
+     *   - S32 -> U8, S8, U16, S16, U32, F16, F32
+     *   - F16 -> U8, S8, U16, S16, U32, F32
+     *   - F32 -> U8, S8, U16, S16, U32, F16
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
+     * @param[out] output          The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  policy          Conversion policy
+     * @param[in]  shift           Value for down/up conversions. Must be 0 <= shift < 8.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayerKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
+     * @param[in] output Destination tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+     * @param[in] policy Conversion policy
+     * @param[in] shift  Value for down/up conversions. Must be 0 <= shift < 8.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift);
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H */
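A hedged sketch of a widening conversion with the kernel above (U8 to F32, saturating, zero shift); the tensors are assumed to be allocated by the caller with matching shapes:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"

    using namespace arm_compute;

    // Illustrative only: convert a U8 tensor to F32 without shifting.
    void convert_u8_to_f32(CLTensor &src_u8, CLTensor &dst_f32)
    {
        ARM_COMPUTE_ERROR_THROW_ON(
            CLDepthConvertLayerKernel::validate(src_u8.info(), dst_f32.info(), ConvertPolicy::SATURATE, 0));

        CLDepthConvertLayerKernel convert;
        convert.configure(&src_u8, &dst_f32, ConvertPolicy::SATURATE, /* shift */ 0);
        CLScheduler::get().enqueue(convert);
    }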
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
index c3a40a2..8946f2a 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
+#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
new file mode 100644
index 0000000..1f7f77b
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
+#define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the depth to space kernel */
+class CLDepthToSpaceLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLDepthToSpaceLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthToSpaceLayerKernel(const CLDepthToSpaceLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthToSpaceLayerKernel &operator=(const CLDepthToSpaceLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLDepthToSpaceLayerKernel(CLDepthToSpaceLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLDepthToSpaceLayerKernel &operator=(CLDepthToSpaceLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLDepthToSpaceLayerKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[out] output      Tensor output. Data types supported: same as @p input
+     * @param[in]  block_shape Block shape value.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[out] output          Tensor output. Data types supported: same as @p input
+     * @param[in]  block_shape     Block shape value.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel.
+     *
+     * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
+     * @param[in] output      Tensor output info. Data types supported: same as @p input
+     * @param[in] block_shape Block shape value.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;       /**< Source tensor */
+    ICLTensor       *_output;      /**< Destination tensor */
+    int32_t          _block_shape; /**< Block shape */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H */
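A hedged sketch of the call pattern and of the shape relationship: with block_shape = 2, an NCHW tensor of shape [W, H, C, N] becomes [2W, 2H, C/4, N] (the concrete tensors are assumed to be set up accordingly by the caller):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"

    using namespace arm_compute;

    // Illustrative only: rearrange depth into 2x2 spatial blocks.
    void depth_to_space_x2(CLTensor &src, CLTensor &dst)
    {
        const int32_t block_shape = 2;

        ARM_COMPUTE_ERROR_THROW_ON(
            CLDepthToSpaceLayerKernel::validate(src.info(), dst.info(), block_shape));

        CLDepthToSpaceLayerKernel d2s;
        d2s.configure(&src, &dst, block_shape);
        CLScheduler::get().enqueue(d2s);
    }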
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index 7958230..c928677 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
+#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -34,6 +33,7 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
+#include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
new file mode 100644
index 0000000..45b5869
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
+#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
+
+#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NCHW.
+ */
+class CLDepthwiseConvolutionLayer3x3NCHWKernel : public ICLDepthwiseConvolutionLayer3x3Kernel
+{
+public:
+    /** Default constructor */
+    CLDepthwiseConvolutionLayer3x3NCHWKernel();
+    /** Initialize the function's source, destination, conv and border_size.
+     *
+     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
+     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  conv_info          Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
+     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     */
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
+                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
+    /** Initialize the function's source, destination, conv and border_size.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
+     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  conv_info          Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
+     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
+                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NCHWKernel
+     *
+     * @param[in] input              Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] weights            Weights tensor info. A 3D tensor with dimensions [3, 3, IFM].
+     *                               Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] biases             Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                               Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] output             Destination tensor. Data type supported: Same as @p input.
+     * @param[in] conv_info          Padding and stride information to use for the convolution.
+     * @param[in] depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in] act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
+     * @param[in] gpu_target         (Optional) GPU target to validate the kernel for. Defaults to midgard.
+     * @param[in] dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization,
+     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in] output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD,
+                           const Size2D &dilation = Size2D(1U, 1U), const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    unsigned int _conv_stride_x;
+    unsigned int _conv_pad_top;
+    unsigned int _conv_pad_left;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H */
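A hedged, validate-only sketch for the NCHW 3x3 depthwise kernel above; all shapes and parameters are assumptions, and in practice the CLDepthwiseConvolutionLayer runtime function selects this kernel and handles border filling itself:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"

    using namespace arm_compute;

    // Illustrative only: F32, 16 channels, 3x3 filter, stride 1, padding 1 (same-size output).
    bool depthwise_3x3_nchw_is_valid()
    {
        const TensorInfo    src(TensorShape(32U, 32U, 16U), 1, DataType::F32);   // W, H, IFM
        const TensorInfo    weights(TensorShape(3U, 3U, 16U), 1, DataType::F32); // one 3x3 filter per channel
        const TensorInfo    biases(TensorShape(16U), 1, DataType::F32);
        const TensorInfo    dst(TensorShape(32U, 32U, 16U), 1, DataType::F32);
        const PadStrideInfo conv_info(1, 1, 1, 1);                               // stride 1, pad 1 (assumed)

        return bool(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&src, &weights, &biases, &dst, conv_info));
    }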
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 876ef1e..0b673cc 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
+#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -34,6 +33,7 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
+#include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
new file mode 100644
index 0000000..ce0bf5c
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
+#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
+
+#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NHWC.
+ */
+class CLDepthwiseConvolutionLayer3x3NHWCKernel : public ICLDepthwiseConvolutionLayer3x3Kernel
+{
+public:
+    /** Default constructor */
+    CLDepthwiseConvolutionLayer3x3NHWCKernel();
+    /** Initialize the function's source, destination, conv and border_size.
+     *
+     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
+     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  conv_info          Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
+     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
+     */
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
+                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
+    /** Initialize the function's source, destination, conv and border_size.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
+     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  conv_info          Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
+     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
+                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
+     *
+     * @param[in] input              Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] weights            Weights tensor info. A 3D tensor with dimensions [IFM, 3, 3].
+     *                               Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] biases             Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                               Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] output             Destination tensor info. Data type supported: Same as @p input.
+     * @param[in] conv_info          Padding and stride information to use for the convolution.
+     * @param[in] depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in] act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
+     * @param[in] dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization,
+     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in] output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     *                               the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
+                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    unsigned int _num_planes_processed_per_iteration;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H */
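
A similar hedged sketch for the NHWC variant; note the [IFM, 3, 3] weights layout and the explicit NHWC data layout on the tensor infos. Shapes and data type are again assumed for illustration only.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"

using namespace arm_compute;

Status check_nhwc_3x3()
{
    // Assumed NHWC shapes: input/output [IFM, W, H], weights [IFM, 3, 3].
    TensorInfo input(TensorShape(8U, 32U, 32U), 1, DataType::F32);
    TensorInfo weights(TensorShape(8U, 3U, 3U), 1, DataType::F32);
    TensorInfo output(TensorShape(8U, 32U, 32U), 1, DataType::F32);
    input.set_data_layout(DataLayout::NHWC);
    weights.set_data_layout(DataLayout::NHWC);
    output.set_data_layout(DataLayout::NHWC);

    return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(
        &input, &weights, nullptr /* no biases */, &output, PadStrideInfo(1, 1, 1, 1));
}
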
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 4580968..748f4a3 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
+#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -33,6 +32,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "src/core/CL/CLValidate.h"
+#include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
new file mode 100644
index 0000000..325f4e7
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
+#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run an MxN depthwise convolution. M and N are respectively the rows and columns of the filter.
+    This kernel assumes that the weights tensor is NOT reshaped (Native version) */
+class CLDepthwiseConvolutionLayerNativeKernel : public ICLKernel
+{
+public:
+    /** Default Constructor */
+    CLDepthwiseConvolutionLayerNativeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthwiseConvolutionLayerNativeKernel(const CLDepthwiseConvolutionLayerNativeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthwiseConvolutionLayerNativeKernel &operator=(const CLDepthwiseConvolutionLayerNativeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLDepthwiseConvolutionLayerNativeKernel(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
+    /** Initialize the function's source, destination and parameters
+     *
+     * @param[in]  input              Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
+     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, N, M].
+     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  dwc_weights_info   Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
+     * @param[in]  dwc_info           Depthwise convolution layer info
+     * @param[in]  conv_info          Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
+     */
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
+                   const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
+                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+    /** Initialize the function's source, destination and parameters
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  input              Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
+     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, N, M].
+     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  dwc_weights_info   Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
+     * @param[in]  dwc_info           Depthwise convolution layer info
+     * @param[in]  conv_info          Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
+                   const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
+                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
+     *
+     * @param[in] input              Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
+     * @param[in] weights            Weights tensor info. A 3D tensor with dimensions [IFM, N, M].
+     *                               Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     * @param[in] biases             Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                               Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] output             Destination tensor info. Data type supported: Same as @p input.
+     * @param[in] dwc_weights_info   Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
+     * @param[in] dwc_info           Depthwise convolution layer info
+     * @param[in] conv_info          Padding and stride information to use for the convolution.
+     * @param[in] depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in] dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in] output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     *                               the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
+                           const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
+                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    const ICLTensor *_weights;
+    const ICLTensor *_biases;
+    ICLTensor       *_output;
+    unsigned int     _depth_multiplier;
+    const ICLTensor *_output_multipliers;
+    const ICLTensor *_output_shifts;
+    bool             _is_quantized;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
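
A hedged sketch of driving the native kernel's validate(). The DWCWeightsKernelInfo and DWCKernelInfo fields used here (n0, activation_info), the shapes and the filter size are assumptions for illustration, not part of this patch.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"

using namespace arm_compute;

Status check_native_dwc()
{
    // Assumed NHWC shapes: input/output [IFM, W, H], weights [IFM, N, M] for a 5x5 filter.
    TensorInfo input(TensorShape(16U, 24U, 24U), 1, DataType::F32);
    TensorInfo weights(TensorShape(16U, 5U, 5U), 1, DataType::F32);
    TensorInfo output(TensorShape(16U, 24U, 24U), 1, DataType::F32);
    input.set_data_layout(DataLayout::NHWC);
    weights.set_data_layout(DataLayout::NHWC);
    output.set_data_layout(DataLayout::NHWC);

    DWCWeightsKernelInfo dwc_weights_info;
    dwc_weights_info.n0 = 4; // output elements per thread (assumed field and value)
    DWCKernelInfo dwc_info;
    dwc_info.activation_info = ActivationLayerInfo(); // no fused activation (assumed field)

    return CLDepthwiseConvolutionLayerNativeKernel::validate(
        &input, &weights, nullptr, &output, dwc_weights_info, dwc_info, PadStrideInfo(1, 1, 2, 2));
}
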
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
index 0ff3c52..b10c23b 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
+#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -33,6 +32,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
+#include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
new file mode 100644
index 0000000..650fe9a
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H
+#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to reshape the weights of depthwise convolution. */
+class CLDepthwiseConvolutionLayerReshapeWeightsKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLDepthwiseConvolutionLayerReshapeWeightsKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthwiseConvolutionLayerReshapeWeightsKernel(const CLDepthwiseConvolutionLayerReshapeWeightsKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthwiseConvolutionLayerReshapeWeightsKernel &operator=(const CLDepthwiseConvolutionLayerReshapeWeightsKernel &) = delete;
+    /** Default Move Constructor. */
+    CLDepthwiseConvolutionLayerReshapeWeightsKernel(CLDepthwiseConvolutionLayerReshapeWeightsKernel &&) = default;
+    /** Default move assignment operator */
+    CLDepthwiseConvolutionLayerReshapeWeightsKernel &operator=(CLDepthwiseConvolutionLayerReshapeWeightsKernel &&) = default;
+
+    /** Initialize the function's source and destination.
+     *
+     * @param[in]  input  The input tensor of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
+     * @param[out] output The output tensor of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: Same as @p input.
+     * @param[in]  info   Depthwise convolution information to reshape the input tensor.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info);
+    /** Initialize the function's source and destination.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           The input tensor of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
+     * @param[out] output          The output tensor of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: Same as @p input.
+     * @param[in]  info            Depthwise convolution information to reshape the input tensor.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel
+     *
+     * @param[in] input  The input tensor info of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
+     * @param[in] output The output tensor info of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: Same as @p input.
+     * @param[in] info   Depthwise convolution information to reshape the input tensor.
+     *
+     * @return a Status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+
+    void configure_dot_product(const DepthwiseConvolutionReshapeInfo &info);
+    void configure_generic(const DepthwiseConvolutionReshapeInfo &info);
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H */
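
A short sketch of the reshape-weights validation. The DepthwiseConvolutionReshapeInfo fields (c0, transpose), the shapes and the data type are assumed for illustration.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"

using namespace arm_compute;

Status check_reshape_weights()
{
    DepthwiseConvolutionReshapeInfo info;
    info.c0        = 4;     // channels read by each thread (assumed field and value)
    info.transpose = false; // assumed field

    // NHWC weights [IFM, W, H] = [8, 3, 3] reshape to [W * H * c0, ceil(IFM / c0)] = [36, 2].
    TensorInfo input(TensorShape(8U, 3U, 3U), 1, DataType::QASYMM8);
    TensorInfo output(TensorShape(36U, 2U), 1, DataType::QASYMM8);
    input.set_data_layout(DataLayout::NHWC);

    return CLDepthwiseConvolutionLayerReshapeWeightsKernel::validate(&input, &output, info);
}
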
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
index e2c49fb..3723c65 100644
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
+#include "src/core/CL/kernels/CLDequantizationLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.h b/src/core/CL/kernels/CLDequantizationLayerKernel.h
new file mode 100644
index 0000000..5579b5b
--- /dev/null
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the dequantization layer kernel. */
+class CLDequantizationLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLDequantizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDequantizationLayerKernel(const CLDequantizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDequantizationLayerKernel &operator=(const CLDequantizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    CLDequantizationLayerKernel(CLDequantizationLayerKernel &&) = default;
+    /** Default move assignment operator */
+    CLDequantizationLayerKernel &operator=(CLDequantizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLDequantizationLayerKernel() = default;
+    /** Set the input and output.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] output Destination tensor. Data types supported: F16/F32.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+    /** Set the input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] output          Destination tensor. Data types supported: F16/F32.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayerKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[in] output Output tensor info. Data types supported: F16/F32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H */
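
Beyond validation, a kernel that now lives under src/ is still driven through the public runtime. A minimal end-to-end sketch for this kernel follows; the shape, quantization parameters and scheduler calls are illustrative assumptions, not part of this patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/core/CL/kernels/CLDequantizationLayerKernel.h"

using namespace arm_compute;

void dequantize_example()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 128)));
    dst.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));

    CLDequantizationLayerKernel kernel;
    kernel.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... map src and fill it with quantized values here ...

    CLScheduler::get().enqueue(kernel);
    CLScheduler::get().sync();
}
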
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
index 659a7cb..5ff1136 100644
--- a/src/core/CL/kernels/CLDerivativeKernel.cpp
+++ b/src/core/CL/kernels/CLDerivativeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
+#include "src/core/CL/kernels/CLDerivativeKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLDerivativeKernel.h b/src/core/CL/kernels/CLDerivativeKernel.h
new file mode 100644
index 0000000..14dd05d
--- /dev/null
+++ b/src/core/CL/kernels/CLDerivativeKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDERIVATIVEKERNEL_H
+#define ARM_COMPUTE_CLDERIVATIVEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the derivative kernel. */
+class CLDerivativeKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLDerivativeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDerivativeKernel(const CLDerivativeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDerivativeKernel &operator=(const CLDerivativeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLDerivativeKernel(CLDerivativeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLDerivativeKernel &operator=(CLDerivativeKernel &&) = default;
+    /** Default destructor */
+    ~CLDerivativeKernel() = default;
+    /** Initialise the kernel's sources, destination and border
+     *
+     * @note At least one of output_x or output_y must be set
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+    /** Initialise the kernel's sources, destination and border
+     *
+     * @note At least one of output_x or output_y must be set
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input;            /**< Input tensor */
+    ICLTensor       *_output_x;         /**< Output tensor - Derivative along the X direction */
+    ICLTensor       *_output_y;         /**< Output tensor - Derivative along the Y direction */
+    bool             _run_derivative_x; /**< Do we need to run Derivative X? */
+    bool             _run_derivative_y; /**< Do we need to run Derivative Y? */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDERIVATIVEKERNEL_H */
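
A brief configure-and-run sketch for the derivative kernel. Shapes are assumed, and at least one of the two gradient outputs must be non-null, as the documentation above notes.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/core/CL/kernels/CLDerivativeKernel.h"

using namespace arm_compute;

void derivative_example()
{
    CLScheduler::get().default_init();

    CLTensor src, dx, dy;
    src.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::U8));
    dx.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::S16));
    dy.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::S16));

    CLDerivativeKernel kernel;
    kernel.configure(&src, &dx, &dy, true /* border_undefined */);

    src.allocator()->allocate();
    dx.allocator()->allocate();
    dy.allocator()->allocate();
    CLScheduler::get().enqueue(kernel);
}
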
diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp
index 1e59c34..cac5bc1 100644
--- a/src/core/CL/kernels/CLDilateKernel.cpp
+++ b/src/core/CL/kernels/CLDilateKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
+#include "src/core/CL/kernels/CLDilateKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLDilateKernel.h b/src/core/CL/kernels/CLDilateKernel.h
new file mode 100644
index 0000000..591ec8c
--- /dev/null
+++ b/src/core/CL/kernels/CLDilateKernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDILATEKERNEL_H
+#define ARM_COMPUTE_CLDILATEKERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the dilate kernel.
+ *
+ */
+class CLDilateKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input            An input tensor. Data types supported: U8
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            An input tensor. Data types supported: U8
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDILATEKERNEL_H */
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 161b221..a642eab 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
+#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.h
new file mode 100644
index 0000000..5cd674f
--- /dev/null
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H
+#define ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the direct convolution kernel.
+ */
+class CLDirectConvolutionLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLDirectConvolutionLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDirectConvolutionLayerKernel(const CLDirectConvolutionLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDirectConvolutionLayerKernel &operator=(const CLDirectConvolutionLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLDirectConvolutionLayerKernel(CLDirectConvolutionLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLDirectConvolutionLayerKernel &operator=(CLDirectConvolutionLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLDirectConvolutionLayerKernel() = default;
+    /** Set the input, weights, biases and output tensors.
+     *
+     * @note: DirectConvolution only works in the following configurations:
+     *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
+     *        3x3 convolution with stride_x = 1/2, stride_y = 1/2
+     *        5x5 convolution with stride_x = 1/2, stride_y = 1/2
+     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2
+     *
+     * @param[in]  input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
+     *                       Data type supported: Same as @p input.
+     * @param[in]  biases    Biases tensor. Biases are 1D tensor with dimension [OFM].
+     *                       Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+     * @param[out] output    Output tensor.
+     *                       The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+    /** Set the input, weights, biases and output tensors.
+     *
+     * @note: DirectConvolution only works in the following configurations:
+     *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
+     *        3x3 convolution with stride_x = 1/2, stride_y = 1/2
+     *        5x5 convolution with stride_x = 1/2, stride_y = 1/2
+     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
+     *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+     * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                             The 3rd dimension must be the same as the input's volume 3rd dimension.
+     *                             Data type supported: Same as @p input.
+     * @param[in]  biases          Biases tensor. Biases are 1D tensor with dimension [OFM].
+     *                             Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+     * @param[out] output          Output tensor.
+     *                             The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerKernel
+     *
+     * @param[in] input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
+     *                      while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+     * @param[in] weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                      The 3rd dimension must be the same as the input's volume 3rd dimension.
+     *                      Data type supported: Same as @p input.
+     * @param[in] biases    Biases tensor. Biases are 1D tensor with dimension [OFM].
+     *                      Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[in] output    Output tensor.
+     *                      The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+     * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in] target    Target GPU architecture.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+public:
+    const ICLTensor *_input;
+    const ICLTensor *_biases;
+    const ICLTensor *_weights;
+    ICLTensor       *_output;
+    DataLayout       _data_layout;
+    BorderSize       _border_size;
+    int              _conv_stride_x;
+    int              _conv_stride_y;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H */
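
A hedged validate() sketch for the direct convolution kernel; the shapes, data type and GPU target query are illustrative assumptions.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"

using namespace arm_compute;

Status check_direct_conv()
{
    // Assumed NCHW shapes: input [W, H, IFM], weights [kx, ky, IFM, OFM], biases [OFM].
    TensorInfo input(TensorShape(56U, 56U, 16U), 1, DataType::F32);
    TensorInfo weights(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32);
    TensorInfo biases(TensorShape(32U), 1, DataType::F32);
    TensorInfo output(TensorShape(56U, 56U, 32U), 1, DataType::F32);

    // 3x3 kernel, stride 1, pad 1 (one of the documented supported configurations).
    return CLDirectConvolutionLayerKernel::validate(
        &input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1), CLScheduler::get().target());
}
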
diff --git a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
index bff0db0..38a7f1b 100644
--- a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
+++ b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+#include "src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h
new file mode 100644
index 0000000..95b5872
--- /dev/null
+++ b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H
+#define ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLSimpleKernel.h"
+
+namespace arm_compute
+{
+/** Interface for the elementwise unary operator */
+class CLElementWiseUnaryLayerKernel : public ICLKernel
+{
+public:
+    /** Initialise the kernel's inputs, output.
+     *
+     * @param[in]  input  Input tensor info. Data types supported: F16/F32.
+     * @param[out] output Output tensor info. Data types supported: Same as @p input.
+     * @param[in]  op     Element wise unary operation to perform.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output, const ElementWiseUnary &op);
+    /** Initialise the kernel's inputs, output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info. Data types supported: F16/F32.
+     * @param[out] output          Output tensor info. Data types supported: Same as @p input.
+     * @param[in]  op              Element wise unary operation to perform.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const ElementWiseUnary &op);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLElementWiseUnaryLayerKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: F16/F32.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input.
+     * @param[in] op     Element wise unary operation to perform.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ElementWiseUnary &op);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H */
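
This kernel is configured on tensor info objects and executed through run_op() with an ITensorPack, so a configure-time check only needs TensorInfo instances. A minimal sketch (shape and operation are assumed):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"

using namespace arm_compute;

Status check_unary_exp()
{
    // configure()/validate() operate on tensor info objects; the actual tensors are
    // supplied at run time through the ITensorPack passed to run_op().
    const TensorInfo input(TensorShape(256U), 1, DataType::F32);
    const TensorInfo output(TensorShape(256U), 1, DataType::F32);
    return CLElementWiseUnaryLayerKernel::validate(&input, &output, ElementWiseUnary::EXP);
}
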
diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
index da28f3d..896ee11 100644
--- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
+++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "src/core/CL/kernels/CLElementwiseOperationKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.h b/src/core/CL/kernels/CLElementwiseOperationKernel.h
new file mode 100644
index 0000000..75030cf
--- /dev/null
+++ b/src/core/CL/kernels/CLElementwiseOperationKernel.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLELEMENTWISEOPERATIONKERNEL_H
+#define ARM_COMPUTE_CLELEMENTWISEOPERATIONKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for an element-wise operation kernel
+ *
+ * Element-wise operation is computed by:
+ * @f[ output(x,y) = OP(input1(x,y), input2(x,y))@f]
+ *
+ */
+class CLElementwiseOperationKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLElementwiseOperationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLElementwiseOperationKernel(const CLElementwiseOperationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLElementwiseOperationKernel &operator=(const CLElementwiseOperationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLElementwiseOperationKernel(CLElementwiseOperationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLElementwiseOperationKernel &operator=(CLElementwiseOperationKernel &&) = default;
+    /** Default destructor */
+    ~CLElementwiseOperationKernel() = default;
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+protected:
+    /** The name of the operation */
+    virtual std::string name() = 0;
+
+    /** Validate the inputs and configure the kernel's execution window.
+     *
+     * @param[in] input1 First tensor input info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a pair of Status and Window
+     */
+    virtual std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) = 0;
+
+    /** Generate the build options for the specific kernel
+     *
+     * @return a CLBuildOptions struct
+     */
+    virtual CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) = 0;
+
+    /** Generate the identifier for tuning
+     *
+     * @return a string
+     */
+    virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) = 0;
+
+    /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
+     *
+     */
+    void configure_common(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+    /** Common configure function, taking an explicit compile context, for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
+     *
+     */
+    void configure_common(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+
+    ActivationLayerInfo _act_info;
+
+private:
+    const ITensorInfo *_input1; /**< Source tensor info 1 */
+    const ITensorInfo *_input2; /**< Source tensor info 2 */
+    ITensorInfo       *_output; /**< Destination tensor info */
+};
+
+/** Kernel for arithmetic operations that apply a saturating convert policy (e.g., Add, Sub) */
+class CLSaturatedArithmeticOperationKernel : public CLElementwiseOperationKernel
+{
+public:
+    CLSaturatedArithmeticOperationKernel()
+        : CLElementwiseOperationKernel(), _policy(), _op()
+    {
+    }
+
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in] op       Arithmetic operation to be executed.
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] policy   Policy to use to handle overflow.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] op              Arithmetic operation to be executed.
+     * @param[in] input1          First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2          Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output          Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] policy          Policy to use to handle overflow.
+     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel
+     *
+     * @param[in] op       Arithmetic operation to be executed.
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] policy   Policy to use to handle overflow.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a Status
+     */
+    static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+protected:
+    // Inherited methods overridden:
+    std::string name() override;
+    std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
+    CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
+    std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
+
+private:
+    ConvertPolicy       _policy;
+    ArithmeticOperation _op;
+};
+
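+/** Kernel for arithmetic operations that do not take a convert policy (e.g., Div, Min, Max, SquaredDiff) */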
+class CLArithmeticOperationKernel : public CLElementwiseOperationKernel
+{
+public:
+    CLArithmeticOperationKernel()
+        : CLElementwiseOperationKernel(), _op()
+    {
+    }
+
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in] op       Arithmetic operation to be executed.
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] op              Arithmetic operation to be executed.
+     * @param[in] input1          First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2          Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output          Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel
+     *
+     * @param[in] op       Arithmetic operation to be executed.
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a Status
+     */
+    static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+protected:
+    // Inherited methods overridden:
+    std::string name() override;
+    std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
+    CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
+    std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
+
+private:
+    ArithmeticOperation _op;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLELEMENTWISEOPERATIONKERNEL_H */
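A rough validation sketch for the relocated kernel, checking a saturated S16 addition before any configuration. These kernels are internal, so the sketch assumes the library's own src/ include path is available (as in the library's tests); the TensorInfo construction and shape are illustrative only.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/core/CL/kernels/CLElementwiseOperationKernel.h"

using namespace arm_compute;

// Returns true if a saturated S16 ADD with the given shape would be accepted.
bool can_run_saturated_add(const TensorShape &shape)
{
    const TensorInfo a(shape, 1, DataType::S16);
    const TensorInfo b(shape, 1, DataType::S16);
    const TensorInfo dst(shape, 1, DataType::S16);

    const Status status = CLSaturatedArithmeticOperationKernel::validate(
        ArithmeticOperation::ADD, &a, &b, &dst, ConvertPolicy::SATURATE);
    return status.error_code() == ErrorCode::OK;
}

The configure() overloads above mirror this signature, so the same descriptors can be reused when setting the kernel up for real tensors.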
diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp
index 29a3297..f6d98a5 100644
--- a/src/core/CL/kernels/CLErodeKernel.cpp
+++ b/src/core/CL/kernels/CLErodeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
+#include "src/core/CL/kernels/CLErodeKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLErodeKernel.h b/src/core/CL/kernels/CLErodeKernel.h
new file mode 100644
index 0000000..4da97ae
--- /dev/null
+++ b/src/core/CL/kernels/CLErodeKernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLERODEKERNEL_H
+#define ARM_COMPUTE_CLERODEKERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the erode kernel.
+ *
+ */
+class CLErodeKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input            An input tensor. Data types supported: U8.
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            An input tensor. Data types supported: U8.
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLERODEKERNEL_H */
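A minimal usage sketch, assuming two CLTensor objects already initialised and allocated as U8 images and an initialised CL scheduler; none of this setup is part of the patch.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLErodeKernel.h"

// Erode src into dst; the border region is left undefined.
void erode_u8(arm_compute::CLTensor &src, arm_compute::CLTensor &dst)
{
    arm_compute::CLErodeKernel erode;
    erode.configure(&src, &dst, /* border_undefined */ true);
    arm_compute::CLScheduler::get().enqueue(erode);
}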
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
index 0478f55..922e50a 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
+#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
new file mode 100644
index 0000000..2e2f1bd
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
+#define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the digit reverse operation kernel. */
+class CLFFTDigitReverseKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLFFTDigitReverseKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFFTDigitReverseKernel(const CLFFTDigitReverseKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFFTDigitReverseKernel &operator=(const CLFFTDigitReverseKernel &) = delete;
+    /** Default Move Constructor. */
+    CLFFTDigitReverseKernel(CLFFTDigitReverseKernel &&) = default;
+    /** Default move assignment operator */
+    CLFFTDigitReverseKernel &operator=(CLFFTDigitReverseKernel &&) = default;
+    /** Default destructor */
+    ~CLFFTDigitReverseKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: F32.
+     * @param[out] output Destination tensor. Data type supported: same as @p input
+     * @param[in]  idx    Digit reverse index tensor. Data type supported: U32
+     * @param[in]  config Kernel configuration.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: F32.
+     * @param[out] output          Destination tensor. Data type supported: same as @p input
+     * @param[in]  idx             Digit reverse index tensor. Data type supported: U32
+     * @param[in]  config          Kernel configuration.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: F32.
+     * @param[in] output Destination tensor info. Data type supported: same as @p input
+     * @param[in] idx    Digit reverse index tensor info. Data type supported: U32
+     * @param[in] config Kernel configuration.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    const ICLTensor *_idx;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H */
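A hedged validation sketch. The FFTDigitReverseKernelInfo field names (axis, conjugate) are assumptions taken from KernelDescriptors.h, and the complex input is modelled as a 2-channel F32 tensor.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"

#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"

using namespace arm_compute;

// Check a digit-reverse stage along axis 0 for a length-N complex signal.
Status check_digit_reverse(unsigned int N)
{
    const TensorInfo in(TensorShape(N), 2, DataType::F32);  // real + imaginary channels
    const TensorInfo out(TensorShape(N), 2, DataType::F32);
    const TensorInfo idx(TensorShape(N), 1, DataType::U32); // digit-reverse indices

    FFTDigitReverseKernelInfo config{};
    config.axis      = 0;
    config.conjugate = false;
    return CLFFTDigitReverseKernel::validate(&in, &out, &idx, config);
}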
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
index 7b17a22..0f06640 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
+#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h
new file mode 100644
index 0000000..c3cc510
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
+#define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include <set>
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the FFT radix stage kernel. */
+class CLFFTRadixStageKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLFFTRadixStageKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFFTRadixStageKernel(const CLFFTRadixStageKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFFTRadixStageKernel &operator=(const CLFFTRadixStageKernel &) = delete;
+    /** Default Move Constructor. */
+    CLFFTRadixStageKernel(CLFFTRadixStageKernel &&) = default;
+    /** Default move assignment operator */
+    CLFFTRadixStageKernel &operator=(CLFFTRadixStageKernel &&) = default;
+    /** Default destructor */
+    ~CLFFTRadixStageKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @note If the output tensor is nullptr, the FFT will be performed in-place
+     *
+     * @param[in,out] input  Source tensor. Data types supported: F32.
+     * @param[out]    output Destination tensor. Can be nullptr. Data type supported: same as @p input
+     * @param[in]     config FFT descriptor metadata.
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
+    /** Set the input and output tensors.
+     *
+     * @note If the output tensor is nullptr, the FFT will be performed in-place
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] input           Source tensor. Data types supported: F32.
+     * @param[out]    output          Destination tensor. Can be nullptr. Data type supported: same as @p input
+     * @param[in]     config          FFT descriptor metadata.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: F32.
+     * @param[in] output Destination tensor info. Can be nullptr. Data type supported: same as @p input
+     * @param[in] config FFT descriptor metadata.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config);
+    /** Returns the radix values supported by the FFT kernel
+     *
+     * @return A set of supported radix values
+     */
+    static std::set<unsigned int> supported_radix();
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor *_input;
+    ICLTensor *_output;
+    bool       _run_in_place;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H */
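The supported radix set is what drives how an FFT length is decomposed into stages. A small sketch that only queries it; the printing is illustrative.

#include <iostream>

#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"

// Print which radix-N stages the OpenCL FFT implementation can run.
void print_supported_radix()
{
    for(unsigned int radix : arm_compute::CLFFTRadixStageKernel::supported_radix())
    {
        std::cout << "radix-" << radix << " stage available\n";
    }
}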
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
index 49fcbb6..4dbe8d2 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.cpp
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h"
+#include "src/core/CL/kernels/CLFFTScaleKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h
new file mode 100644
index 0000000..cb007e5
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTScaleKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H
+#define ARM_COMPUTE_CLFFTSCALEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the inverse FFT scale kernel. */
+class CLFFTScaleKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLFFTScaleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFFTScaleKernel(const CLFFTScaleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFFTScaleKernel &operator=(const CLFFTScaleKernel &) = delete;
+    /** Default Move Constructor. */
+    CLFFTScaleKernel(CLFFTScaleKernel &&) = default;
+    /** Default move assignment operator */
+    CLFFTScaleKernel &operator=(CLFFTScaleKernel &&) = default;
+    /** Default destructor */
+    ~CLFFTScaleKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in,out] input  Source tensor. Data types supported: F32.
+     * @param[out]    output Destination tensor. Data type supported: same as @p input
+     * @param[in]     config Kernel configuration
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
+    /** Set the input and output tensors.
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] input           Source tensor. Data types supported: F32.
+     * @param[out]    output          Destination tensor. Data type supported: same as @p input
+     * @param[in]     config          Kernel configuration
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: F32.
+     * @param[in] output Destination tensor info. Data type supported: same as @p input
+     * @param[in] config Kernel configuration
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor *_input;
+    ICLTensor *_output;
+    bool       _run_in_place;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFFTSCALEKERNEL_H */
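A sketch of how the scale descriptor is typically filled for an inverse transform. The FFTScaleKernelInfo field names (scale, conjugate) are assumptions taken from KernelDescriptors.h.

#include "arm_compute/core/KernelDescriptors.h"

#include "src/core/CL/kernels/CLFFTScaleKernel.h"

// Build the 1/N normalisation descriptor commonly applied after an inverse FFT pass.
arm_compute::FFTScaleKernelInfo make_inverse_fft_scale(unsigned int fft_length)
{
    arm_compute::FFTScaleKernelInfo info{};
    info.scale     = 1.f / static_cast<float>(fft_length); // inverse FFT normalisation factor
    info.conjugate = true;                                  // undo the conjugation used for the inverse pass
    return info;
}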
diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp
index ebdfd27..7481fd1 100644
--- a/src/core/CL/kernels/CLFastCornersKernel.cpp
+++ b/src/core/CL/kernels/CLFastCornersKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
+#include "src/core/CL/kernels/CLFastCornersKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLFastCornersKernel.h b/src/core/CL/kernels/CLFastCornersKernel.h
new file mode 100644
index 0000000..0c1b564
--- /dev/null
+++ b/src/core/CL/kernels/CLFastCornersKernel.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFASTCORNERSKERNEL_H
+#define ARM_COMPUTE_CLFASTCORNERSKERNEL_H
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+}
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** CL kernel to perform FAST corner detection */
+class CLFastCornersKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLFastCornersKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFastCornersKernel(const CLFastCornersKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFastCornersKernel &operator=(const CLFastCornersKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLFastCornersKernel(CLFastCornersKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLFastCornersKernel &operator=(CLFastCornersKernel &&) = default;
+    /** Default destructor */
+    ~CLFastCornersKernel() = default;
+
+    /** Initialise the kernel.
+     *
+     * @param[in]  input               Source image. Data types supported: U8.
+     * @param[out] output              Output image. Data types supported: U8.
+     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+     * @param[in]  non_max_suppression True if non-maxima suppression is applied, false otherwise.
+     * @param[in]  border_mode         Strategy to use for borders.
+     */
+    void configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
+    /** Initialise the kernel.
+     *
+     * @param[in]  compile_context     The compile context to be used.
+     * @param[in]  input               Source image. Data types supported: U8.
+     * @param[out] output              Output image. Data types supported: U8.
+     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+     * @param[in]  non_max_suppression True if non-maxima suppression is applied, false otherwise.
+     * @param[in]  border_mode         Strategy to use for borders.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
+
+    // Inherited methods overridden
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLImage *_input;
+    ICLImage       *_output;
+};
+
+/** CL kernel to copy keypoint information to an ICLKeyPointArray and count the number of key points */
+class CLCopyToArrayKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLCopyToArrayKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCopyToArrayKernel(const CLCopyToArrayKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCopyToArrayKernel &operator=(const CLCopyToArrayKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLCopyToArrayKernel(CLCopyToArrayKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLCopyToArrayKernel &operator=(CLCopyToArrayKernel &&) = default;
+    /** Default destructor */
+    ~CLCopyToArrayKernel() = default;
+
+    /** Initialise the kernel.
+     *
+     * @param[in]  input         Source image. Data types supported: U8.
+     * @param[in]  update_number Flag to indicate whether we need to update the number of corners
+     * @param[out] corners       Array of keypoints to store the results.
+     * @param[out] num_buffers   CL buffer used to store the number of detected keypoints.
+     */
+    void configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
+    /** Initialise the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source image. Data types supported: U8.
+     * @param[in]  update_number   Flag to indicate whether we need to update the number of corners
+     * @param[out] corners         Array of keypoints to store the results.
+     * @param[out] num_buffers     CL buffer used to store the number of detected keypoints.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLImage   *_input;      /**< source image */
+    ICLKeyPointArray *_corners;    /**< destination array */
+    cl::Buffer       *_num_buffer; /**< CL memory to record number of key points in the array */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLFASTCORNERSKERNEL_H */
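A rough sketch chaining the two kernels above. Tensor, CLKeyPointArray and cl::Buffer allocation, the threshold value, and an initialised scheduler are all assumptions.

#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLFastCornersKernel.h"

using namespace arm_compute;

// Detect FAST corners in src, then copy the keypoints and their count out.
void fast_corners(CLTensor &src, CLTensor &response, CLKeyPointArray &corners, cl::Buffer &num_corners)
{
    CLFastCornersKernel detect;
    detect.configure(&src, &response, /* threshold */ 20.f, /* non_max_suppression */ true, BorderMode::UNDEFINED);

    CLCopyToArrayKernel to_array;
    to_array.configure(&response, /* update_number */ true, &corners, &num_corners);

    CLScheduler::get().enqueue(detect);
    CLScheduler::get().enqueue(to_array);
}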
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index e92619a..5d77c29 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h
new file mode 100644
index 0000000..7951f48
--- /dev/null
+++ b/src/core/CL/kernels/CLFillBorderKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFILLBORDERKERNEL_H
+#define ARM_COMPUTE_CLFILLBORDERKERNEL_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for filling the border of a kernel */
+class CLFillBorderKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLFillBorderKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFillBorderKernel(const CLFillBorderKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFillBorderKernel &operator=(const CLFillBorderKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLFillBorderKernel(CLFillBorderKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLFillBorderKernel &operator=(CLFillBorderKernel &&) = default;
+    /** Default destructor */
+    ~CLFillBorderKernel() = default;
+
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]     compile_context       The compile context to be used.
+     * @param[in,out] tensor                Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
+     * @param[in]     border_size           Size of the border to fill in elements.
+     * @param[in]     border_mode           Border mode to use for the convolution.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in,out] tensor                Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
+     * @param[in]     border_size           Size of the border to fill in elements.
+     * @param[in]     border_mode           Border mode to use for the convolution.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]     compile_context       The compile context to be used.
+     * @param[in,out] tensor                Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
+     * @param[in]     border_size           Size of the border to fill in elements.
+     * @param[in]     border_mode           Border mode to use for the convolution.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+
+    /** Set the kernel's constant border value, with the handling depending on the underlying data type.
+     *
+     * @param[in] idx                   Index of the kernel argument to set.
+     * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    template <class T>
+    void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    bool is_parallelisable() const override;
+
+private:
+    ICLTensor *_tensor;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFILLBORDERKERNEL_H */
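A minimal sketch, assuming an already-allocated CLTensor and an initialised scheduler; the default-constructed PixelValue() gives a zero constant border.

#include "arm_compute/core/PixelValue.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLFillBorderKernel.h"

using namespace arm_compute;

// Fill the tensor's border region with zeros so later kernels can safely read past the valid area.
void fill_zero_border(CLTensor &tensor, const BorderSize &border)
{
    CLFillBorderKernel fill;
    fill.configure(&tensor, border, BorderMode::CONSTANT, PixelValue());
    CLScheduler::get().enqueue(fill);
}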
diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
index 590fcee..b3f84b6 100644
--- a/src/core/CL/kernels/CLFlattenLayerKernel.cpp
+++ b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
+#include "src/core/CL/kernels/CLFlattenLayerKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.h b/src/core/CL/kernels/CLFlattenLayerKernel.h
new file mode 100644
index 0000000..2471cf2
--- /dev/null
+++ b/src/core/CL/kernels/CLFlattenLayerKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFLATTENLAYERKERNEL_H
+#define ARM_COMPUTE_CLFLATTENLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL interface for the flatten kernel. */
+class CLFlattenLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLFlattenLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFlattenLayerKernel(const CLFlattenLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFlattenLayerKernel &operator=(const CLFlattenLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLFlattenLayerKernel(CLFlattenLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLFlattenLayerKernel &operator=(CLFlattenLayerKernel &&) = default;
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input  First input tensor to flatten with at least 3 dimensions.
+     *                    The dimensions above the third will be interpreted as batches. Data types supported: All.
+     * @param[out] output Output tensor with shape [w*h*d, input_batches] where:
+     *                    w = input tensor width, h = input tensor height and d = input tensor depth. Data type supported: same as @p input
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           First input tensor to flatten with at least 3 dimensions.
+     *                             The dimensions above the third will be interpreted as batches. Data types supported: All.
+     * @param[out] output          Output tensor with shape [w*h*d, input_batches] where:
+     *                             w = input tensor width, h = input tensor height and d = input tensor depth. Data type supported: same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFlattenLayerKernel
+     *
+     * @param[in]  input  First input tensor to flatten with at least 3 dimensions.
+     *                    The dimensions above the third will be interpreted as batches. Data types supported: All.
+     * @param[in]  output Output tensor info with shape [w*h*d, input_batches] where:
+     *                    w = input tensor width, h = input tensor height and d = input tensor depth. Data type supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFLATTENLAYERKERNEL_H */
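The output shape rule above in code form, a sketch assuming a (W, H, D, N) dimension ordering; ShapeCalculator.h (already included by the kernel's source above) provides the analogous helper used by the library itself.

#include "arm_compute/core/TensorShape.h"

// Flatten output shape for an input laid out as (W, H, D, N):
// e.g. (8, 4, 3, 2) -> (8*4*3, 2) = (96, 2).
arm_compute::TensorShape flattened_shape(const arm_compute::TensorShape &in)
{
    return arm_compute::TensorShape(in[0] * in[1] * in[2], in[3]);
}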
diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp
index 8884f3f..2af0089 100644
--- a/src/core/CL/kernels/CLFloorKernel.cpp
+++ b/src/core/CL/kernels/CLFloorKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
+#include "src/core/CL/kernels/CLFloorKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLFloorKernel.h b/src/core/CL/kernels/CLFloorKernel.h
new file mode 100644
index 0000000..f563514
--- /dev/null
+++ b/src/core/CL/kernels/CLFloorKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFLOORKERNEL_H
+#define ARM_COMPUTE_CLFLOORKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a floor operation */
+class CLFloorKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLFloorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFloorKernel(const CLFloorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFloorKernel &operator=(const CLFloorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLFloorKernel(CLFloorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLFloorKernel &operator=(CLFloorKernel &&) = default;
+    /** Default destructor */
+    ~CLFloorKernel() = default;
+    /** Set the source, destination of the kernel
+     *
+     * @param[in]  input  Source tensor. Data type supported: F16/F32.
+     * @param[out] output Destination tensor. Same as @p input
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+
+    /** Set the source, destination of the kernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data type supported: F16/F32.
+     * @param[out] output          Destination tensor. Same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFloorKernel
+     *
+     * @param[in] input  Source tensor info. Data type supported: F16/F32.
+     * @param[in] output Destination tensor info. Same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFLOORKERNEL_H */
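A quick validation sketch, assuming the usual TensorInfo descriptors: floor is only defined for F16/F32, so an S32 input should be rejected.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"

#include "src/core/CL/kernels/CLFloorKernel.h"

using namespace arm_compute;

// Expected to return false: floor does not accept integer tensors.
bool floor_accepts_s32()
{
    const TensorInfo info(TensorShape(8U, 8U), 1, DataType::S32);
    return CLFloorKernel::validate(&info, &info).error_code() == ErrorCode::OK;
}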
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index 3572319..2116239 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
+#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
new file mode 100644
index 0000000..78b1e74
--- /dev/null
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H
+#define ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** OpenCL kernel to fuse the batch normalization node to a preceding convolution node */
+class CLFuseBatchNormalizationKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLFuseBatchNormalizationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFuseBatchNormalizationKernel(const CLFuseBatchNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFuseBatchNormalizationKernel &operator=(const CLFuseBatchNormalizationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLFuseBatchNormalizationKernel(CLFuseBatchNormalizationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLFuseBatchNormalizationKernel &operator=(CLFuseBatchNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~CLFuseBatchNormalizationKernel() = default;
+    /** Set the source, destination of the kernel
+     *
+     * @param[in]  input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
+     * @param[in]  bn_mean       Batch normalization layer mean tensor. Same as @p input_weights
+     * @param[in]  bn_var        Batch normalization layer variance tensor. Same as @p input_weights
+     * @param[out] fused_weights Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights
+     * @param[out] fused_bias    Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
+     * @param[in]  input_bias    (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
+     * @param[in]  bn_beta       (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
+     *                           @note if nullptr, bn_beta is set to 0.0
+     * @param[in]  bn_gamma      (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
+     *                           @note if nullptr, bn_gamma is set to 1.0
+     * @param[in]  epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
+     * @param[in]  fbn_type      (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
+     */
+    void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
+                   const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
+                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    /** Set the source, destination of the kernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input_weights   Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
+     * @param[in]  bn_mean         Batch normalization layer mean tensor. Same as @p input_weights
+     * @param[in]  bn_var          Batch normalization layer variance tensor. Same as @p input_weights
+     * @param[out] fused_weights   Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights
+     * @param[out] fused_bias      Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
+     * @param[in]  input_bias      (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
+     * @param[in]  bn_beta         (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
+     *                             @note if nullptr, bn_beta is set to 0.0
+     * @param[in]  bn_gamma        (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
+     *                             @note if nullptr, bn_gamma is set to 1.0
+     * @param[in]  epsilon         (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
+     * @param[in]  fbn_type        (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
+                   const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
+                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel
+     *
+     * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
+     * @param[in] bn_mean       Batch normalization layer mean tensor info. Same as @p input_weights
+     * @param[in] bn_var        Batch normalization layer variance tensor info. Same as @p input_weights
+     * @param[in] fused_weights Output fused weights tensor info. It can be a nullptr in case of in-place computation. Same as @p input_weights
+     * @param[in] fused_bias    Output fused bias tensor info. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
+     * @param[in] input_bias    (Optional) Input bias tensor info for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
+     * @param[in] bn_beta       (Optional) Batch normalization layer beta tensor info. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
+     *                          @note if nullptr, bn_beta is set to 0.0
+     * @param[in] bn_gamma      (Optional) Batch normalization layer gamma tensor info. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
+     *                          @note if nullptr, bn_gamma is set to 1.0
+     * @param[in] epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
+     * @param[in] fbn_type      (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+                           const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+                           const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
+                           float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input_weights;
+    const ICLTensor *_input_bias;
+    const ICLTensor *_bn_mean;
+    const ICLTensor *_bn_var;
+    const ICLTensor *_bn_gamma;
+    const ICLTensor *_bn_beta;
+    ICLTensor       *_fused_weights;
+    ICLTensor       *_fused_bias;
+    float            _epsilon;
+    bool             _run_in_place_weights;
+    bool             _run_in_place_bias;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H */
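For context, the folding that the configure()/validate() parameters above describe can be written as plain per-channel arithmetic. This is only an illustrative host-side sketch of standard batch-normalization fusion, not the kernel's OpenCL code; the function name is hypothetical, and the defaults mirror the @note entries above (bn_gamma = 1.0 and bn_beta = 0.0 when the corresponding tensor is a nullptr, epsilon = 0.001f).

#include <cmath>

// Illustrative scalar fusion for one output channel:
// fused_weights = input_weights * gamma / sqrt(var + epsilon)
// fused_bias    = (input_bias - mean) * gamma / sqrt(var + epsilon) + beta
void fuse_bn_channel(float &weight, float &bias,
                     float bn_mean, float bn_var,
                     float bn_gamma = 1.0f, float bn_beta = 0.0f,
                     float epsilon = 0.001f)
{
    const float scale = bn_gamma / std::sqrt(bn_var + epsilon);
    weight = weight * scale;
    bias   = (bias - bn_mean) * scale + bn_beta;
}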
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
index af7755b..1f89865 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
new file mode 100644
index 0000000..125f0c6
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
+#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */
+class CLGEMMLowpMatrixMultiplyNativeKernel : public ICLKernel
+{
+public:
+    /** Default Constructor */
+    CLGEMMLowpMatrixMultiplyNativeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpMatrixMultiplyNativeKernel(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpMatrixMultiplyNativeKernel &operator=(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpMatrixMultiplyNativeKernel(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpMatrixMultiplyNativeKernel &operator=(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input0    Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  input1    Input tensor containing the RHS matrix. Data type supported: same as @p input0
+     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: S32
+     * @param[in]  lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread
+     *                       lhs_info.m0: 2,3,4,5,6,7,8
+     *                       lhs_info.k0: 2,3,4,8,16
+     * @param[in]  rhs_info  RHS matrix information used to retrieve the number of columns to be processed by each thread
+     *                       rhs_info.n0: 2,3,4,8,16
+     *                       rhs_info.k0: same as lhs_info.k0
+     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input0          Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  input1          Input tensor containing the RHS matrix. Data type supported: same as @p input0
+     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: S32
+     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows to be processed by each thread
+     *                             lhs_info.m0: 2,3,4,5,6,7,8
+     *                             lhs_info.k0: 2,3,4,8,16
+     * @param[in]  rhs_info        RHS matrix information used to retrieve the number of columns to be processed by each thread
+     *                             rhs_info.n0: 2,3,4,8,16
+     *                             rhs_info.k0: same as lhs_info.k0
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMReshapeInfo &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyNativeKernel
+     *
+     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in] input1    Input tensor info for the RHS matrix. Data type supported: same as @p input0
+     * @param[in] output    Output tensor info. Data type supported: S32
+     * @param[in] lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread
+     *                      lhs_info.m0: 2,3,4,5,6,7,8
+     *                      lhs_info.k0: 2,3,4,8,16
+     * @param[in] rhs_info  RHS matrix information used to retrieve the number of columns to be processed by each thread
+     *                      rhs_info.n0: 2,3,4,8,16
+     *                      rhs_info.k0: same as lhs_info.k0
+     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMReshapeInfo &gemm_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    ICLTensor       *_output;
+    bool             _slide_matrix_b;
+    bool             _reinterpret_input_as_3d;
+    bool             _reinterpret_output_as_3d;
+    bool             _use_dummy_work_items;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H*/
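The m0/k0/n0 ranges listed in the documentation above can be summarised in a small standalone helper. This is a hypothetical convenience (not part of the library) that simply mirrors the documented constraints, including rhs_info.k0 == lhs_info.k0:

#include <initializer_list>

// Mirrors the documented block-size constraints for this kernel.
bool is_supported_lowp_native_config(unsigned int m0, unsigned int lhs_k0,
                                     unsigned int n0, unsigned int rhs_k0)
{
    const auto one_of = [](unsigned int v, std::initializer_list<unsigned int> set)
    {
        for(unsigned int s : set)
        {
            if(v == s)
            {
                return true;
            }
        }
        return false;
    };
    return (m0 >= 2 && m0 <= 8)                  // lhs_info.m0: 2..8
           && one_of(lhs_k0, { 2, 3, 4, 8, 16 }) // lhs_info.k0
           && one_of(n0, { 2, 3, 4, 8, 16 })     // rhs_info.n0
           && (rhs_k0 == lhs_k0);                // rhs_info.k0 must match lhs_info.k0
}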
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
index 713d822..ded4b29 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
new file mode 100644
index 0000000..100100b
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
+#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
+ *
+ * @note The input matrices @p input0 and @p input1 must be reshaped through @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
+ */
+class CLGEMMLowpMatrixMultiplyReshapedKernel : public ICLKernel
+{
+public:
+    /** Default Constructor */
+    CLGEMMLowpMatrixMultiplyReshapedKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpMatrixMultiplyReshapedKernel(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpMatrixMultiplyReshapedKernel(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input0    Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
+     * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: S32
+     * @param[in]  lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
+     *                       lhs_info.m0: 2,3,4,5,6,7,8
+     *                       lhs_info.k0: 2,3,4,8,16
+     *                       lhs_info.transpose: false
+     * @param[in]  rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
+     *                       rhs_info.n0: 2,3,4,8,16
+     *                       rhs_info.k0: same as lhs_info.k0
+     *                       rhs_info.transpose: true
+     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @note lhs_info.k0 must be equal to rhs_info.k0
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input0          Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
+     * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: S32
+     * @param[in]  lhs_info        LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
+     *                             lhs_info.m0: 2,3,4,5,6,7,8
+     *                             lhs_info.k0: 2,3,4,8,16
+     *                             lhs_info.transpose: false
+     * @param[in]  rhs_info        RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
+     *                             rhs_info.n0: 2,3,4,8,16
+     *                             rhs_info.k0: same as lhs_info.k0
+     *                             rhs_info.transpose: true
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @note lhs_info.k0 must be equal to rhs_info.k0
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMReshapeInfo &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedKernel
+     *
+     * @param[in] input0    Input tensor info containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
+     * @param[in] input1    Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+     * @param[in] output    Output tensor info. Data type supported: S32
+     * @param[in] lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
+     *                      lhs_info.m0: 2,3,4,5,6,7,8
+     *                      lhs_info.k0: 2,3,4,8,16
+     *                      lhs_info.transpose: false
+     * @param[in] rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
+     *                      rhs_info.n0: 2,3,4,8,16
+     *                      rhs_info.k0: 2,3,4,8,16
+     *                      rhs_info.transpose: true
+     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @note lhs_info.k0 must be equal to rhs_info.k0
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMReshapeInfo &gemm_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    ICLTensor       *_output;
+    bool             _slide_matrix_b;
+    bool             _reinterpret_output_as_3d;
+    unsigned int     _k;
+    bool             _use_dummy_work_items;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H*/
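A minimal sketch of filling GEMMLHSMatrixInfo/GEMMRHSMatrixInfo with values that satisfy the constraints documented above (lhs_info.transpose = false, rhs_info.transpose = true, matching k0). The field names are assumed from arm_compute/core/Types.h and the concrete block sizes are placeholders:

#include "arm_compute/core/Types.h"

using namespace arm_compute;

GEMMLHSMatrixInfo make_lhs_info()
{
    GEMMLHSMatrixInfo lhs_info{};
    lhs_info.m0        = 4;     // one of 2..8
    lhs_info.k0        = 4;     // one of 2,3,4,8,16
    lhs_info.transpose = false; // required by this kernel
    return lhs_info;
}

GEMMRHSMatrixInfo make_rhs_info()
{
    GEMMRHSMatrixInfo rhs_info{};
    rhs_info.n0        = 4;    // one of 2,3,4,8,16
    rhs_info.k0        = 4;    // must equal lhs_info.k0
    rhs_info.transpose = true; // required by this kernel
    return rhs_info;
}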
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 33fb903..95aa30d 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -195,11 +195,11 @@
 {
     ARM_COMPUTE_UNUSED(vector_sum_row, vector_sum_col, output_multipliers, bias, output_shifts);
 
-    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
-    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d != 0);
+    const GEMMLowpOutputStageInfo output_stage                        = gemm_info.output_stage;
+    unsigned int                 &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int                 &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool                          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
+    bool                          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d != 0);
 
     Window win{};
     Window win_out{};
@@ -297,7 +297,7 @@
                                                   output_multipliers != nullptr ? output_multipliers->info() : nullptr,
                                                   output_shifts != nullptr ? output_shifts->info() : nullptr));
 
-    auto padding_info = get_padding_info({ input0, input1, output, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts });
+    auto                          padding_info = get_padding_info({ input0, input1, output, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts });
     const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
     const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
     const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
@@ -349,7 +349,7 @@
     // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
     // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
     const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-        // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
     const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
     const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
 
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
new file mode 100644
index 0000000..222a861
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (input1) has been reshaped
+ *
+ * @note The input matrix input1 must be reshaped through @ref CLGEMMReshapeRHSMatrixKernel
+ * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
+ */
+class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
+{
+public:
+    /** Default Constructor */
+    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input0             Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  input1             Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
+     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
+     *                                Only the following values are supported for LHS info:
+     *                                lhs_info.m0: 2,3,4,5,6,7,8
+     *                                lhs_info.k0: 2,3,4,8,16
+     *                                Only the following values are supported for RHS info:
+     *                                rhs_info.n0: 2,3,4,8,16
+     *                                rhs_info.k0: same as lhs_info.k0
+     *                                rhs_info.transpose: true
+     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
+     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
+     * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32.
+     * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
+     *                                Supported data types: S32.
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
+                   const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  input0             Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  input1             Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
+     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
+     *                                Only the following values are supported for LHS info:
+     *                                lhs_info.m0: 2,3,4,5,6,7,8
+     *                                lhs_info.k0: 2,3,4,8,16
+     *                                Only the following values are supported for RHS info:
+     *                                rhs_info.n0: 2,3,4,8,16
+     *                                rhs_info.k0: same as lhs_info.k0
+     *                                rhs_info.transpose: true
+     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
+     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
+     * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32.
+     * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
+     *                                Supported data types: S32.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
+                   const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
+     *
+     * @param[in] input0             Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in] input1             Input tensor info for the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+     * @param[in] output             Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
+     * @param[in] gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
+     *                               Only the following values are supported for LHS info:
+     *                               lhs_info.m0: 2,3,4,5,6,7,8
+     *                               lhs_info.k0: 2,3,4,8,16
+     *                               Only the following values are supported for RHS info:
+     *                               rhs_info.n0: 2,3,4,8,16
+     *                               rhs_info.k0: same as lhs_info.k0
+     *                               rhs_info.transpose: true
+     * @param[in] vector_sum_col     (Optional) Input row-vector info of sums of all the entries in each column of matrix B.
+     *                               Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
+     * @param[in] vector_sum_row     (Optional) Input row-vector info of sums of all the entries in each row of matrix A.
+     *                               Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
+     * @param[in] bias               (Optional) Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                               Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
+     * @param[in] output_multipliers (Optional) Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                               Supported data types: S32.
+     * @param[in] output_shifts      (Optional) Output shifts tensor info. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
+     *                               Supported data types: S32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col = nullptr,
+                           const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, const ITensorInfo *output_multipliers = nullptr,
+                           const ITensorInfo *output_shifts = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    ICLTensor       *_output;
+    const ICLTensor *_vector_sum_col;
+    const ICLTensor *_vector_sum_row;
+    const ICLTensor *_bias;
+    const ICLTensor *_output_multipliers;
+    const ICLTensor *_output_shifts;
+    bool             _slide_matrix_b;
+    bool             _reinterpret_input_as_3d;
+    bool             _reinterpret_output_as_3d;
+    bool             _use_dummy_work_items;
+    bool             _is_quantized_per_channel;
+    bool             _fuse_output_stage;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H */
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
index aa4eea6..c7844b9 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
new file mode 100644
index 0000000..f870559
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
+#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (vector_sum_col[k] * a_offset) +
+ *                   (vector_sum_row[i] * b_offset) +
+ *                   (a_offset * b_offset * k)
+ *
+ */
+class CLGEMMLowpOffsetContributionKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLGEMMLowpOffsetContributionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in, out] mm_result      Input tensor containing the result of the matrix multiplication. Data type supported: S32
+     * @param[in]      vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]      vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]      bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in]      k              Number of matrix A columns or matrix B rows
+     * @param[in]      a_offset       Offset to be added to each element of the matrix A.
+     * @param[in]      b_offset       Offset to be added to each element of the matrix B.
+     */
+    void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, int32_t b_offset);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] mm_result       Input tensor containing the result of the matrix multiplication. Data type supported: S32
+     * @param[in]      vector_sum_col  Input row-vector of sums of all the entries in each column of matrix B.
+     *                                 Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]      vector_sum_row  Input row-vector of sums of all the entries in each row of matrix A.
+     *                                 Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]      bias            Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                 Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in]      k               Number of matrix A columns or matrix B rows
+     * @param[in]      a_offset        Offset to be added to each element of the matrix A.
+     * @param[in]      b_offset        Offset to be added to each element of the matrix B.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
+                   int32_t b_offset);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
+     *
+     * @param[in] mm_result      Input tensor containing the result of the matrix multiplication. Data type supported: S32
+     * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                           Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] a_offset       Offset to be added to each element of the matrix A.
+     * @param[in] b_offset       Offset to be added to each element of the matrix B.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_vector_sum_col;
+    const ICLTensor *_vector_sum_row;
+    ICLTensor       *_mm_result;
+    const ICLTensor *_bias;
+};
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
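The in-place update described by the formula above has a direct scalar equivalent, shown here as a host-side reference (a hypothetical helper, assuming vector_sum_col/vector_sum_row have already been computed; either term is dropped when the corresponding offset is zero):

#include <cstdint>

// Scalar reference of the documented offset contribution for one output element.
// k_depth is the number of matrix A columns (the accumulation depth).
int32_t add_offset_contribution(int32_t mm_result,
                                int32_t sum_col, int32_t sum_row,
                                int32_t a_offset, int32_t b_offset, int32_t k_depth)
{
    return mm_result
           + sum_col * a_offset
           + sum_row * b_offset
           + a_offset * b_offset * k_depth;
}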
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
index afa7bdb..b41d870 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
new file mode 100644
index 0000000..15f54d1
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
+#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
+ * of matrix A and matrix B and performs the output stage defined by the output_stage argument
+ *
+ * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
+ */
+class CLGEMMLowpOffsetContributionOutputStageKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLGEMMLowpOffsetContributionOutputStageKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    CLGEMMLowpOffsetContributionOutputStageKernel(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    CLGEMMLowpOffsetContributionOutputStageKernel &operator=(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpOffsetContributionOutputStageKernel(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
+     * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  k                  Number of matrix A columns or matrix B rows
+     * @param[in]  a_offset           Offset to be added to each element of the matrix A.
+     * @param[in]  b_offset           Offset to be added to each element of the matrix B.
+     * @param[in]  output_stage       GEMMLowp output stage info
+     * @param[in]  output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32
+     * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
+     *                                Supported data types: S32
+     */
+    void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
+                   const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
+     * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  k                  Number of matrix A columns or matrix B rows
+     * @param[in]  a_offset           Offset to be added to each element of the matrix A.
+     * @param[in]  b_offset           Offset to be added to each element of the matrix B.
+     * @param[in]  output_stage       GEMMLowp output stage info
+     * @param[in]  output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32
+     * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
+     *                                Supported data types: S32
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
+                   int32_t k,
+                   int32_t a_offset, int32_t b_offset,
+                   const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionOutputStageKernel
+     *
+     * @param[in] mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
+     * @param[in] vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
+     *                               Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
+     *                               Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                               Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in] a_offset           Offset to be added to each element of the matrix A.
+     * @param[in] b_offset           Offset to be added to each element of the matrix B.
+     * @param[in] output_stage       GEMMLowp output stage info
+     * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                               Supported data types: S32
+     * @param[in] output_shifts      Output shifts tensor info. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
+     *                               Supported data types: S32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
+                           int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_mm_result;
+    const ICLTensor *_vector_sum_col;
+    const ICLTensor *_vector_sum_row;
+    const ICLTensor *_bias;
+    ICLTensor       *_output;
+    const ICLTensor *_output_multipliers;
+    const ICLTensor *_output_shifts;
+    bool             _is_quantized_per_channel;
+};
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
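A minimal sketch of how the GEMMLowpOutputStageInfo mentioned in the @note above might be populated for the fused QUANTIZE_DOWN_FIXEDPOINT stage. The field names are assumed from arm_compute/core/Types.h; all numeric values are placeholders, not a real quantization setup:

#include "arm_compute/core/Types.h"

using namespace arm_compute;

GEMMLowpOutputStageInfo make_output_stage_info()
{
    GEMMLowpOutputStageInfo info{};
    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.gemmlowp_offset     = 10;         // destination quantization offset (placeholder)
    info.gemmlowp_multiplier = 1073741824; // fixed-point multiplier (placeholder)
    info.gemmlowp_shift      = 5;          // right shift applied after the multiplication (placeholder)
    info.gemmlowp_min_bound  = 0;
    info.gemmlowp_max_bound  = 255;
    info.output_data_type    = DataType::QASYMM8; // required so the output can be auto-initialized
    return info;
}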
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
index ab1b5a2..d0f0168 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h
new file mode 100644
index 0000000..8653102
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H
+#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute the fixed-point multiplication between each entry of the input and gemmlowp_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16.
+ */
+class CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data type supported: S32
+     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output          Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
+     * @param[in]  info            Output stage info. Used to pass the quantized output data type
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+     *
+     * @param[in] input  Input tensor. Data type supported: S32
+     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
+     * @param[in] info   Output stage info. Used to pass the quantized output data type
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    const ICLTensor *_bias;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H */
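
For intuition, the down-scale steps documented in this header map onto roughly the following scalar model. This is an illustrative sketch only, not part of the patch: the helper names are placeholders, the rounding helpers assume the gemmlowp conventions (saturating rounding doubling high multiply, round-to-nearest power-of-two division), and the OpenCL implementation may order the bias addition differently.

```cpp
#include <algorithm>
#include <cstdint>
#include <limits>

// Saturating rounding doubling high multiply (gemmlowp convention, assumed).
static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if(a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max(); // the only overflowing case
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = (ab >= 0) ? (int64_t(1) << 30) : (1 - (int64_t(1) << 30));
    return static_cast<int32_t>((ab + nudge) / (int64_t(1) << 31));
}

// Round-to-nearest division by 2^shift (assumed rounding convention).
static int32_t rounding_divide_by_pow2(int32_t x, int shift)
{
    const int32_t mask      = (int32_t(1) << shift) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> shift) + ((remainder > threshold) ? 1 : 0);
}

// One accumulator value through the documented steps, producing a QASYMM8 value.
static uint8_t quantize_down_fixedpoint(int32_t acc, int32_t bias, int32_t gemmlowp_multiplier,
                                        int result_shift, int32_t result_offset,
                                        int32_t min_bound, int32_t max_bound)
{
    acc = sat_rounding_doubling_high_mul(acc, gemmlowp_multiplier); // fixed-point multiplication
    acc += bias;                                                    // optional bias
    acc = rounding_divide_by_pow2(acc, result_shift);               // round-to-nearest power-of-two division
    acc += result_offset;                                           // output offset
    acc = std::max(min_bound, std::min(max_bound, acc));            // user-provided bounds
    return static_cast<uint8_t>(std::max(0, std::min(255, acc)));   // QASYMM8 range and cast
}
```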
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
index ad5bac0..1d29dfe 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
new file mode 100644
index 0000000..0a8d5e1
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
+#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute the multiplication between each entry of the input and the floating-point result multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Requantize
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *      - to the [0..255] range and cast to QASYMM8.
+ *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ */
+class CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data type supported: S32
+     * @param[in]  bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                    Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  info   Output stage info. Used to pass the quantized output data type
+     */
+    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data type supported: S32
+     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output          Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  info            Output stage info. Used to pass the quantized output data type
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
+     *
+     * @param[in] input  Input tensor. Data type supported: S32
+     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in] info   Output stage info. Used to pass the quantized output data type
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    const ICLTensor *_bias;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H */
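
The float variant above replaces the fixed-point multiplier with a real-valued scale. A minimal scalar sketch, assuming the offset is applied after scaling and a round-to-nearest conversion (parameter names are placeholders, not the kernel's API):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative only: one int32 accumulator scaled by a floating-point multiplier
// and converted to QASYMM8. The exact rounding mode of the OpenCL kernel is not implied.
static uint8_t quantize_down_float(int32_t acc, int32_t bias, float real_multiplier,
                                   int32_t result_offset, int32_t min_bound, int32_t max_bound)
{
    acc += bias;                                                     // optional bias
    const float scaled = static_cast<float>(acc) * real_multiplier;  // requantize with a float scale
    int32_t q = static_cast<int32_t>(std::lround(scaled));           // round to nearest integer
    q += result_offset;                                              // output offset
    q = std::max(min_bound, std::min(max_bound, q));                 // user-provided bounds
    return static_cast<uint8_t>(std::max(0, std::min(255, q)));      // QASYMM8 range and cast
}
```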
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
index 8e4b291..d32d328 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
new file mode 100644
index 0000000..abdf33e
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
+#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Add offset terms to final result
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *      - to the [0..255] range and cast to QASYMM8.
+ *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ */
+class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLGEMMLowpQuantizeDownInt32ScaleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    CLGEMMLowpQuantizeDownInt32ScaleKernel(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpQuantizeDownInt32ScaleKernel(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input        Input tensor. Data type supported: S32
+     * @param[in]  bias         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output       Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  output_stage GEMMLowp output stage metadata.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data type supported: S32
+     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output          Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  output_stage    GEMMLowp output stage metadata.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
+     *
+     * @param[in] input        Input tensor. Data type supported: S32
+     * @param[in] bias         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                         Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output       Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in] output_stage GEMMLowp output stage metadata.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    const ICLTensor *_bias;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */
\ No newline at end of file
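
A rough scalar model of the integer-only scale stage, following the step order listed in the class documentation above; the names are placeholders and the saturation behaviour is assumed, so this is a sketch rather than the kernel's implementation:

```cpp
#include <algorithm>
#include <cstdint>

// One accumulator through the documented steps of the integer scale output stage.
static uint8_t quantize_down_int32_scale(int32_t acc, int32_t bias, int32_t result_offset,
                                         int32_t result_mult_int, int result_shift,
                                         int32_t min_bound, int32_t max_bound)
{
    acc += result_offset;                                          // offset term
    acc *= result_mult_int;                                        // integer multiplier
    acc += bias;                                                   // optional bias
    acc >>= result_shift;                                          // power-of-two division
    acc = std::max(min_bound, std::min(max_bound, acc));           // user-provided bounds
    return static_cast<uint8_t>(std::max(0, std::min(255, acc)));  // QASYMM8 range and cast
}
```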
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
index 339049f..d508bf6 100644
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h b/src/core/CL/kernels/CLGEMMLowpReductionKernel.h
new file mode 100644
index 0000000..237d809
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
+#define ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+struct GEMMLowpReductionKernelInfo;
+
+/** Common interface for all OpenCL reduction kernels */
+class ICLGEMMLowpReductionKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    ICLGEMMLowpReductionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default;
+
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+     * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
+     * @param[in]  info   Kernel metadata:
+     *                    - k            Number of matrix columns/rows depending on the type of reduction.
+     *                    - is_reshaped  True if the matrix has been reshaped.
+     *                    - scalar       Scalar value to multiply each reduced column/row by.
+     *                    - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    virtual void configure(const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+     * @param[out] output          Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k            Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped  True if the matrix has been reshaped.
+     *                             - scalar       Scalar value to multiply each reduced column/row by.
+     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
+
+protected:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+     * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in]  info           Kernel metadata:
+     *                            - k            Number of matrix columns/rows depending on the type of reduction.
+     *                            - is_reshaped  True if the matrix has been reshaped.
+     *                            - scalar       Scalar value to multiply each reduced column/row by.
+     *                            - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  mtx_a           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+     * @param[out] vector_sum_row  Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k            Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped  True if the matrix has been reshaped.
+     *                             - scalar       Scalar value to multiply each reduced column/row by.
+     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel
+     *
+     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+     * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in] info           Kernel metadata:
+     *                           - k            Number of matrix columns/rows depending on the type of reduction.
+     *                           - is_reshaped  True if the matrix has been reshaped.
+     *                           - scalar       Scalar value to multiply each reduced column/row by.
+     *                           - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
+     * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in]  info           Kernel metadata:
+     *                            - k            Number of matrix columns/rows depending on the type of reduction.
+     *                            - is_reshaped  True if the matrix has been reshaped.
+     *                            - scalar       Scalar value to multiply each reduced column/row by.
+     *                            - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  mtx_b           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
+     * @param[out] vector_sum_col  Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k            Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped  True if the matrix has been reshaped.
+     *                             - scalar       Scalar value to multiply each reduced column/row by.
+     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel
+     *
+     * @param[in] mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
+     * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in] info           Kernel metadata:
+     *                           - k            Number of matrix columns/rows depending on the type of reduction.
+     *                           - is_reshaped  True if the matrix has been reshaped.
+     *                           - scalar       Scalar value to multiply each reduced column/row by.
+     *                           - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H */
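
The two reduction kernels above produce the per-row sums of matrix A and per-column sums of matrix B that the offset-contribution kernels consume. A scalar reference, assuming row-major uint8 storage (illustrative only; function and variable names are not part of the library):

```cpp
#include <cstdint>
#include <vector>

// Row sums of an m x k matrix A (feeds the a_offset compensation).
static std::vector<int32_t> sum_rows_a(const uint8_t *mtx_a, int m, int k, int32_t scalar, bool mul_by_scalar)
{
    std::vector<int32_t> vector_sum_row(m, 0);
    for(int r = 0; r < m; ++r)
    {
        for(int c = 0; c < k; ++c)
        {
            vector_sum_row[r] += mtx_a[r * k + c];
        }
        if(mul_by_scalar)
        {
            vector_sum_row[r] *= scalar;
        }
    }
    return vector_sum_row;
}

// Column sums of a k x n matrix B (feeds the b_offset compensation).
static std::vector<int32_t> sum_cols_b(const uint8_t *mtx_b, int k, int n, int32_t scalar, bool mul_by_scalar)
{
    std::vector<int32_t> vector_sum_col(n, 0);
    for(int c = 0; c < n; ++c)
    {
        for(int r = 0; r < k; ++r)
        {
            vector_sum_col[c] += mtx_b[r * n + c];
        }
        if(mul_by_scalar)
        {
            vector_sum_col[c] *= scalar;
        }
    }
    return vector_sum_col;
}
```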
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index fd09782..b0d08a7 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
new file mode 100644
index 0000000..71d223b
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H
+#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided. All elements of the output matrix will be multiplied by alpha. If matrix C is passed, it will be added to the previous result.
+ *  For the matrix C, the broadcast addition is supported if the flag "broadcast_bias" is set in the GEMMReshapeInfo object
+ *
+ * @note If the input tensors @p input0 and @p input1 have been reshaped respectively with @ref CLGEMMReshapeLHSMatrixKernel" and @ref CLGEMMReshapeRHSMatrixKernel,
+ *       the flag @p is_interleaved_transposed must be set to true
+ *
+ * @attention @p input1 tensor must have at least 2 dimensions (matrix)
+ *
+ */
+class CLGEMMMatrixMultiplyKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLGEMMMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixMultiplyKernel(const CLGEMMMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixMultiplyKernel &operator=(const CLGEMMMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixMultiplyKernel(CLGEMMMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixMultiplyKernel &operator=(CLGEMMMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's input, output and alpha
+     *
+     * @param[in]  input0                    Input tensor containing the Matrix A. Data types supported: F16/F32
+     * @param[in]  input1                    Input tensor containing the Matrix B. Data type supported: same as @p input0
+     * @param[in]  input2                    Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p input0
+     * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in]  alpha                     Weight of the matrix product
+     * @param[in]  beta                      (Optional) Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
+     * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
+     * @param[in]  reshape_info              (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
+     * @param[in]  fp_mixed_precision        (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
+     * @param[in]  activation_info           (Optional) Activation to apply after the matrix multiplication
+     *
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta = 0.f,
+                   bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
+    /** Initialise the kernel's input, output and alpha
+     *
+     * @param[in]  compile_context           The compile context to be used.
+     * @param[in]  input0                    Input tensor containing the Matrix A. Data types supported: F16/F32
+     * @param[in]  input1                    Input tensor containing the Matrix B. Data type supported: same as @p input0
+     * @param[in]  input2                    Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p input0
+     * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in]  alpha                     Weight of the matrix product
+     * @param[in]  beta                      (Optional) Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
+     * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
+     * @param[in]  reshape_info              (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
+     * @param[in]  fp_mixed_precision        (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
+     * @param[in]  activation_info           (Optional) Activation to apply after the matrix multiplication
+     *
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta = 0.f,
+                   bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
+     *
+     * @param[in] input0                    Input tensor containing the Matrix A info. Data types supported: F16/F32
+     * @param[in] input1                    Input tensor containing the Matrix B info. Data type supported: same as @p input0
+     * @param[in] input2                    Input tensor containing the Matrix C (bias) info. Can be nullptr. Data type supported: same as @p input0
+     * @param[in] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in] alpha                     Weight of the matrix product
+     * @param[in] beta                      Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
+     * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
+     * @param[in] reshape_info              GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
+     * @param[in] gpu_target                GPU Target
+     * @param[in] fp_mixed_precision        (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
+     * @param[in] activation_info           (Optional) Activation to apply after the matrix multiplication
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
+                           bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+public:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    const ICLTensor *_input2;
+    ICLTensor       *_output;
+    bool             _slide_matrix_b;
+    bool             _reinterpret_input_as_3d;
+    bool             _reinterpret_output_as_3d;
+    bool             _add_bias;
+    bool             _broadcast_bias;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H */
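
The kernel above computes alpha * A * B with an optional bias C, scaled by beta and possibly broadcast. A plain row-major scalar reference of that computation, for orientation only (no interleaving, reshaping or blocking, and not the library's code path):

```cpp
#include <cstddef>

// D = alpha * A * B (+ beta * C). A is MxK, B is KxN, C/D are MxN;
// broadcast_bias treats C as a single row repeated over the M dimension.
static void gemm_reference(const float *a, const float *b, const float *c, float *d,
                           std::size_t m, std::size_t n, std::size_t k,
                           float alpha, float beta, bool broadcast_bias)
{
    for(std::size_t i = 0; i < m; ++i)
    {
        for(std::size_t j = 0; j < n; ++j)
        {
            float acc = 0.f;
            for(std::size_t p = 0; p < k; ++p)
            {
                acc += a[i * k + p] * b[p * n + j];
            }
            acc *= alpha;
            if(c != nullptr)
            {
                acc += beta * (broadcast_bias ? c[j] : c[i * n + j]);
            }
            d[i * n + j] = acc;
        }
    }
}
```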
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
index cea147b..f613937 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
new file mode 100644
index 0000000..6b6004b
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
+#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply matrices when neither of the input matrices has been reshaped */
+class CLGEMMMatrixMultiplyNativeKernel : public ICLKernel
+{
+public:
+    /** Default Constructor */
+    CLGEMMMatrixMultiplyNativeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixMultiplyNativeKernel(const CLGEMMMatrixMultiplyNativeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixMultiplyNativeKernel &operator=(const CLGEMMMatrixMultiplyNativeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixMultiplyNativeKernel(CLGEMMMatrixMultiplyNativeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixMultiplyNativeKernel &operator=(CLGEMMMatrixMultiplyNativeKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input0    Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
+     * @param[in]  input1    Input tensor for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+     * @param[in]  input2    Input tensor containing the bias matrix. Data type supported: same as @p input0.
+     * @param[out] output    Output tensor info. Data type supported: same as @p input0
+     * @param[in]  alpha     Weight of the matrix product
+     * @param[in]  beta      Weight of the matrix bias
+     * @param[in]  lhs_info  LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
+     *                       lhs_info.m0: 1,2,3,4,5,6,7,8
+     *                       lhs_info.k0: 2,3,4,8,16
+     * @param[in]  rhs_info  RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
+     *                       rhs_info.n0: 2,3,4,8,16
+     *                       rhs_info.k0: same as lhs_info.k0
+     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input0          Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
+     * @param[in]  input1          Input tensor for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+     * @param[in]  input2          Input tensor containing the bias matrix. Data type supported: same as @p input0.
+     * @param[out] output          Output tensor info. Data type supported: same as @p input0
+     * @param[in]  alpha           Weight of the matrix product
+     * @param[in]  beta            Weight of the matrix bias
+     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
+     *                             lhs_info.m0: 1,2,3,4,5,6,7,8
+     *                             lhs_info.k0: 2,3,4,8,16
+     * @param[in]  rhs_info        RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
+     *                             rhs_info.n0: 2,3,4,8,16
+     *                             rhs_info.k0: same as lhs_info.k0
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyNativeKernel
+     *
+     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
+     * @param[in] input1    Input tensor info for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+     * @param[in] input2    Input tensor info containing the bias matrix. Data type supported: same as @p input0.
+     * @param[in] output    Output tensor info. Data type supported: same as @p input0
+     * @param[in] alpha     Weight of the matrix product
+     * @param[in] beta      Weight of the matrix bias
+     * @param[in] lhs_info  LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
+     *                      lhs_info.m0: 1,2,3,4,5,6,7,8
+     *                      lhs_info.k0: 2,3,4,8,16
+     * @param[in] rhs_info  RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
+     *                      rhs_info.n0: 2,3,4,8,16
+     *                      rhs_info.k0: same as lhs_info.k0
+     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMKernelInfo    &gemm_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    const ICLTensor *_input2;
+    ICLTensor       *_output;
+    bool             _slide_matrix_b;
+    bool             _reinterpret_input_as_3d;
+    bool             _reinterpret_output_as_3d;
+    bool             _use_dummy_work_items;
+    bool             _add_bias;
+    bool             _broadcast_bias;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H*/
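
The lhs_info.m0 / rhs_info.n0 / k0 values describe how the output is tiled across work-items: each work-item produces an m0 x n0 block of the output, stepping through the K dimension k0 elements at a time. A host-side sketch of that partitioning (conceptual only, not how the kernel is actually dispatched):

```cpp
// Conceptual tiling of a native (non-reshaped) GEMM; loop bodies are intentionally
// left as comments because only the partitioning is being illustrated.
void tiled_gemm_sketch(int M, int N, int K, int m0, int n0, int k0)
{
    for(int i = 0; i < M; i += m0)       // one tile row of the output
    {
        for(int j = 0; j < N; j += n0)   // one work-item -> m0 x n0 output block
        {
            // acc[m0][n0] initialised to zero
            for(int p = 0; p < K; p += k0)
            {
                // load an m0 x k0 tile of the LHS and a k0 x n0 tile of the RHS,
                // then multiply-accumulate into acc
            }
            // apply alpha/beta/bias and store acc to the output block
        }
    }
}
```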
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
index eaf5708..fb15b42 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
new file mode 100644
index 0000000..2ffc322
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
+#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
+ *
+ * @note The input matrices @p input0 and @p input1 must be reshaped through @ref CLGEMMReshapeLHSMatrixKernel and  @ref CLGEMMReshapeRHSMatrixKernel
+ */
+class CLGEMMMatrixMultiplyReshapedKernel : public ICLKernel
+{
+public:
+    /** Default Constructor */
+    CLGEMMMatrixMultiplyReshapedKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixMultiplyReshapedKernel(const CLGEMMMatrixMultiplyReshapedKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixMultiplyReshapedKernel &operator=(const CLGEMMMatrixMultiplyReshapedKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixMultiplyReshapedKernel(CLGEMMMatrixMultiplyReshapedKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixMultiplyReshapedKernel &operator=(CLGEMMMatrixMultiplyReshapedKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
+     *       Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
+     *       multiplications. i.e. float c = (half)a * (half)b
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in]  input0    Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less than or equal to 4
+     * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3
+     * @param[in]  input2    Input tensor containing the bias matrix. Data type supported: same as @p input0.
+     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in]  alpha     Weight of the matrix product
+     * @param[in]  beta      Weight of the matrix bias
+     * @param[in]  lhs_info  LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
+     *                       lhs_info.m0: 2,3,4,5,6,7,8
+     *                       lhs_info.k0: 2,3,4,8,16
+     *                       lhs_info.transpose: false
+     * @param[in]  rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
+     *                       rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+     *                       rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+     *                       rhs_info.transpose: true
+     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @note lhs_info.k0 must be equal to rhs_info.k0
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
+    /** Initialise the kernel's input and output.
+     *
+     * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
+     *       Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
+     *       multiplications. i.e. float c = (half)a * (half)b
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input0          Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less than or equal to 4
+     * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3
+     * @param[in]  input2          Input tensor containing the bias matrix. Data type supported: same as @p input0.
+     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in]  alpha           Weight of the matrix product
+     * @param[in]  beta            Weight of the matrix bias
+     * @param[in]  lhs_info        LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
+     *                             lhs_info.m0: 2,3,4,5,6,7,8
+     *                             lhs_info.k0: 2,3,4,8,16
+     *                             lhs_info.transpose: false
+     * @param[in]  rhs_info        RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
+     *                             rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+     *                             rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+     *                             rhs_info.transpose: true
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @note lhs_info.k0 must be equal to rhs_info.k0
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedKernel
+     *
+     * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
+     *       Mixed precision combines different floating-point precisions during the computation: F32 for the accumulations and F16 for the
+     *       multiplications, i.e. float c = (half)a * (half)b
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in] input0    Input tensor info containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less than or equal to 4
+     * @param[in] input1    Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3
+     * @param[in] input2    Input tensor info containing the bias matrix. Data type supported: same as @p input0.
+     * @param[in] output    Output tensor info to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in] alpha     Weight of the matrix product
+     * @param[in] beta      Weight of the matrix bias
+     * @param[in] lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
+     *                      lhs_info.m0: 2,3,4,5,6,7,8
+     *                      lhs_info.k0: 2,3,4,8,16
+     *                      lhs_info.transpose: false
+     * @param[in] rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
+     *                      rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+     *                      rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+     *                      rhs_info.transpose: true
+     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @note lhs_info.k0 must be equal to rhs_info.k0
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMKernelInfo    &gemm_info);
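+    /* Validation sketch (editorial addition): the static validate() can be called on the
+     * tensors' ITensorInfo before configure(), reusing the lhs_info/rhs_info/gemm_info
+     * objects from the configure sketch above; tensor names are illustrative.
+     *
+     *   const Status status = CLGEMMMatrixMultiplyReshapedKernel::validate(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(),
+     *                                                                      1.0f, 1.0f, lhs_info, rhs_info, gemm_info);
+     *   ARM_COMPUTE_ERROR_THROW_ON(status);
+     */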
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    const ICLTensor *_input2;
+    ICLTensor       *_output;
+    bool             _slide_matrix_b;
+    bool             _reinterpret_output_as_3d;
+    bool             _use_dummy_work_items;
+    bool             _add_bias;
+    bool             _broadcast_bias;
+    bool             _export_to_cl_image;
+    unsigned int     _k;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H*/
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index d53aede..1f296f8 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
@@ -356,10 +356,10 @@
         ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
     }
 
-    const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y? 3u : 2u;
+    const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u;
     const size_t rhs_idx_batch_size = 2u;
     const size_t bia_idx_batch_size = 2u;
-    const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y? 3u : 2u;
+    const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u;
 
     Window slice          = window.first_slice_window_3D();
     Window slice_matrix_b = slice;
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
new file mode 100644
index 0000000..5b96679
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply matrices when only the input matrix RHS (input1) has been reshaped
+ *
+ * @note The input matrix input1 must be reshaped through @ref CLGEMMReshapeRHSMatrixKernel
+ */
+class CLGEMMMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in]  input0    Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
+     *                       The number of dimensions for the LHS matrix must be less than or equal to 4.
+     * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+     * @param[in]  input2    Input tensor containing the bias matrix. Data type supported: same as @p input0.
+     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in]  alpha     Weight of the matrix product
+     * @param[in]  beta      Weight of the matrix bias
+     * @param[in]  lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
+     *                       lhs_info.m0: 1,2,3,4,5,6,7,8
+     * @param[in]  rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
+     *                       rhs_info.k0: 2,3,4,8,16
+     *                       rhs_info.n0: 2,3,4,8,16
+     *                       rhs_info.transpose: true,false
+     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
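+    /* Usage sketch (editorial addition): a hypothetical call assuming `lhs`, `rhs_reshaped`,
+     * `bias` and `dst` are pre-allocated CLTensor objects (illustrative names), with
+     * `rhs_reshaped` produced by CLGEMMReshapeRHSMatrixKernel; block sizes are example
+     * values from the supported ranges.
+     *
+     *   GEMMLHSMatrixInfo lhs_info;
+     *   lhs_info.m0 = 4;            // rows processed per thread, allowed: 1..8
+     *
+     *   GEMMRHSMatrixInfo rhs_info;
+     *   rhs_info.n0        = 4;
+     *   rhs_info.k0        = 4;
+     *   rhs_info.transpose = true;
+     *
+     *   GEMMKernelInfo gemm_info;   // original M/N/K dimensions of the GEMM
+     *
+     *   CLGEMMMatrixMultiplyReshapedOnlyRHSKernel mm_only_rhs;
+     *   mm_only_rhs.configure(&lhs, &rhs_reshaped, &bias, &dst, 1.0f, 1.0f, lhs_info, rhs_info, gemm_info);
+     */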
+    /** Initialise the kernel's input and output.
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input0          Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
+     *                             The number of dimensions for the LHS matrix must be less than or equal to 4.
+     * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+     * @param[in]  input2          Input tensor containing the bias matrix. Data type supported: same as @p input0.
+     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in]  alpha           Weight of the matrix product
+     * @param[in]  beta            Weight of the matrix bias
+     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
+     *                             lhs_info.m0: 1,2,3,4,5,6,7,8
+     * @param[in]  rhs_info        RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
+     *                             rhs_info.k0: 2,3,4,8,16
+     *                             rhs_info.n0: 2,3,4,8,16
+     *                             rhs_info.transpose: true,false
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
+     *                      The number of dimensions for the LHS matrix must be less than or equal to 4.
+     * @param[in] input1    Input tensor info for the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+     * @param[in] input2    Input tensor info containing the bias matrix. Data type supported: same as @p input0.
+     * @param[in] output    Output tensor info. Data type supported: same as @p input0
+     * @param[in] alpha     Weight of the matrix product
+     * @param[in] beta      Weight of the matrix bias
+     * @param[in] lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
+     *                      lhs_info.m0: 1,2,3,4,5,6,7,8
+     * @param[in] rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
+     *                      rhs_info.k0: 2,3,4,8,16
+     *                      rhs_info.n0: 2,3,4,8,16
+     *                      rhs_info.transpose: true,false
+     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMKernelInfo    &gemm_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    const ICLTensor *_input2;
+    ICLTensor       *_output;
+    bool             _slide_matrix_b;
+    bool             _reinterpret_input_as_3d;
+    bool             _reinterpret_output_as_3d;
+    bool             _use_dummy_work_items;
+    bool             _add_bias;
+    bool             _broadcast_bias;
+    bool             _export_to_cl_image;
+    bool             _has_pad_y;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H*/
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
index 04aa061..ee0abc5 100644
--- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h
new file mode 100644
index 0000000..bef8c23
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMMATRIXVECTORMULTIPLYKERNEL_H
+#define ARM_COMPUTE_CLGEMMMATRIXVECTORMULTIPLYKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the GEMM matrix vector multiply kernel. **/
+class CLGEMMMatrixVectorMultiplyKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLGEMMMatrixVectorMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixVectorMultiplyKernel(const CLGEMMMatrixVectorMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixVectorMultiplyKernel &operator=(const CLGEMMMatrixVectorMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixVectorMultiplyKernel(CLGEMMMatrixVectorMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixVectorMultiplyKernel &operator=(CLGEMMMatrixVectorMultiplyKernel &&) = default;
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input0 The reshaped input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in]  input1 The 2D reshaped weights tensor. Data type supported: Same as @p input0.
+     * @param[out] output The output 2D tensor. Data types supported: Same as @p input0; S32 for QASYMM8/QASYMM8_SIGNED input.
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
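+    /* Usage sketch (editorial addition): assumes `in0` (reshaped input), `in1` (2D reshaped
+     * weights) and `out` are pre-allocated CLTensor objects; names are illustrative.
+     *
+     *   CLGEMMMatrixVectorMultiplyKernel mv_kernel;
+     *   ARM_COMPUTE_ERROR_THROW_ON(CLGEMMMatrixVectorMultiplyKernel::validate(in0.info(), in1.info(), out.info()));
+     *   mv_kernel.configure(&in0, &in1, &out);
+     */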
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input0          The reshaped input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in]  input1          The 2D reshaped weights tensor. Data type supported: Same as @p input0.
+     * @param[out] output          The output 2D tensor. Data types supported: Same as @p input0; S32 for QASYMM8/QASYMM8_SIGNED input.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixVectorMultiplyKernel
+     *
+     * @param[in] input0 The reshaped input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in] input1 The 2D reshaped weights tensor info. Data type supported: Same as @p input0.
+     * @param[in] output The output 2D tensor info. Data types supported: Same as @p input0; S32 for QASYMM8/QASYMM8_SIGNED input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    ICLTensor       *_output;
+    int              _num_rows_read_per_iteration;
+    BorderSize       _border_size;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMMATRIXVECTORMULTIPLYKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
index f2ad677..3e2fc79 100644
--- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
new file mode 100644
index 0000000..92202a2
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H
+#define ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication.
+ *  In particular, this kernel splits the input matrix into blocks of size M0xK0 (defined through GEMMLHSMatrixInfo) and
+ *  stores each block in the output matrix, unrolling the values
+ */
+class CLGEMMReshapeLHSMatrixKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLGEMMReshapeLHSMatrixKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMReshapeLHSMatrixKernel(const CLGEMMReshapeLHSMatrixKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMReshapeLHSMatrixKernel &operator=(const CLGEMMReshapeLHSMatrixKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMReshapeLHSMatrixKernel(CLGEMMReshapeLHSMatrixKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMReshapeLHSMatrixKernel &operator=(CLGEMMReshapeLHSMatrixKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input                   Input tensor. Data types supported: All
+     * @param[out] output                  Output tensor. Data type supported: same as @p input
+     * @param[in]  lhs_info                LHS matrix information to be used for reshaping. This object contains all the necessary
+     *                                     information to reshape the input tensor. Only the following values are supported:
+     *                                     lhs_info.m0: 2,3,4,5,6,7,8
+     *                                     lhs_info.k0: 2,3,4,8,16
+     *                                     lhs_info.v0: greater than 0
+     *                                     lhs_info.transpose: true, false
+     *                                     lhs_info.interleave: true, false
+     * @param[in]  reinterpret_input_as_3d (Optional) True if the input has to be reinterpreted as a 3D tensor
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false);
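+    /* Usage sketch (editorial addition): assumes `lhs` and `lhs_reshaped` are pre-allocated
+     * CLTensor objects (illustrative names); values are examples from the supported ranges
+     * listed above.
+     *
+     *   GEMMLHSMatrixInfo lhs_info;
+     *   lhs_info.m0         = 4;
+     *   lhs_info.k0         = 4;
+     *   lhs_info.v0         = 2;    // any value greater than 0
+     *   lhs_info.transpose  = true;
+     *   lhs_info.interleave = true;
+     *
+     *   CLGEMMReshapeLHSMatrixKernel reshape_lhs;
+     *   reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info); // reinterpret_input_as_3d defaults to false
+     */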
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context         The compile context to be used.
+     * @param[in]  input                   Input tensor. Data types supported: All
+     * @param[out] output                  Output tensor. Data type supported: same as @p input
+     * @param[in]  lhs_info                LHS matrix information to be used for reshaping. This object contains all the necessary
+     *                                     information to reshape the input tensor. Only the following values are supported:
+     *                                     lhs_info.m0: 2,3,4,5,6,7,8
+     *                                     lhs_info.k0: 2,3,4,8,16
+     *                                     lhs_info.v0: greater than 0
+     *                                     lhs_info.transpose: true, false
+     *                                     lhs_info.interleave: true, false
+     * @param[in]  reinterpret_input_as_3d (Optional) True if the input has to be reinterpreted as a 3D tensor
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeLHSMatrixKernel
+     *
+     * @param[in] input                   Input tensor info. Data types supported: All
+     * @param[in] output                  Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+     * @param[in] lhs_info                LHS matrix information to be used for reshaping. This object contains all the necessary
+     *                                    information to reshape the input tensor. Only the following values are supported:
+     *                                    lhs_info.m0: 2,3,4,5,6,7,8
+     *                                    lhs_info.k0: 2,3,4,8,16
+     *                                    lhs_info.v0: greater than 0
+     *                                    lhs_info.transpose: true, false
+     *                                    lhs_info.interleave: true, false
+     * @param[in] reinterpret_input_as_3d True if the input has to be reinterpreted as a 3D tensor
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d);
+
+    // Inherited methods overridden
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    bool             _reinterpret_input_as_3d;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H */
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
index d94e834..33de61e 100644
--- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
new file mode 100644
index 0000000..911484e
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H
+#define ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication
+ *  In particular, this kernel splits the input matrix into blocks of size K0xN0 and stores each block in
+ *  the output matrix, unrolling the values */
+class CLGEMMReshapeRHSMatrixKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLGEMMReshapeRHSMatrixKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMReshapeRHSMatrixKernel(const CLGEMMReshapeRHSMatrixKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMReshapeRHSMatrixKernel &operator=(const CLGEMMReshapeRHSMatrixKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMReshapeRHSMatrixKernel(CLGEMMReshapeRHSMatrixKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default;
+    /** Default destructor */
+    ~CLGEMMReshapeRHSMatrixKernel() = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
+     *       required to create an OpenCL image object from a buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.
+     *       Since the OpenCL image object is created by importing the OpenCL buffer, the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32, F16
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# output width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# output (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *       -# The output tensor should only be consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+     *
+     * @param[in]  input    Input tensor. Data types supported: All
+     * @param[out] output   Output tensor. Data type supported: same as @p input
+     * @param[in]  rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
+     *                      information to reshape the input tensor. Only the following values are supported:
+     *                      rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+     *                      rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+     *                      rhs_info.h0: greater than 0
+     *                      rhs_info.transpose: true, false
+     *                      rhs_info.interleave: true, false
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
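+    /* Usage sketch (editorial addition): assumes `rhs` and `rhs_reshaped` are pre-allocated
+     * CLTensor objects (illustrative names); the values below stay within the supported
+     * ranges and leave export_to_cl_image disabled.
+     *
+     *   GEMMRHSMatrixInfo rhs_info;
+     *   rhs_info.n0                 = 4;
+     *   rhs_info.k0                 = 4;
+     *   rhs_info.h0                 = 2;    // any value greater than 0
+     *   rhs_info.transpose          = true;
+     *   rhs_info.interleave         = true;
+     *   rhs_info.export_to_cl_image = false;
+     *
+     *   CLGEMMReshapeRHSMatrixKernel reshape_rhs;
+     *   reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info);
+     */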
+    /** Initialise the kernel's input and output.
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
+     *       required to create an OpenCL image object from a buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.
+     *       Since the OpenCL image object is created by importing the OpenCL buffer, the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32, F16
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# output width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# output (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *       -# The output tensor should only be consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data types supported: All
+     * @param[out] output          Output tensor. Data type supported: same as @p input
+     * @param[in]  rhs_info        RHS matrix information to be used for reshaping. This object contains all the necessary
+     *                             information to reshape the input tensor. Only the following values are supported:
+     *                             rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+     *                             rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+     *                             rhs_info.h0: greater than 0
+     *                             rhs_info.transpose: true, false
+     *                             rhs_info.interleave: true, false
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeRHSMatrixKernel
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
+     *       required to create an OpenCL image object from a buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.
+     *       Since the OpenCL image object is created by importing the OpenCL buffer, the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32, F16
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# output width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# output (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *       -# The output tensor should only be consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+     *
+     * @param[in] input    Input tensor info. Data types supported: All
+     * @param[in] output   Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+     * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
+     *                     information to reshape the input tensor. Only the following values are supported:
+     *                     rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+     *                     rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false),(only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+     *                     rhs_info.h0: greater than 0
+     *                     rhs_info.transpose: true, false
+     *                     rhs_info.interleave: true, false
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info);
+
+    // Inherited methods overridden
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H */
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
index a8508be..9e802c2 100644
--- a/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+#include "src/core/CL/kernels/CLGatherKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/helpers/AutoConfiguration.h"
diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h
new file mode 100644
index 0000000..8f472a4
--- /dev/null
+++ b/src/core/CL/kernels/CLGatherKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGATHERKERNEL_H
+#define ARM_COMPUTE_CLGATHERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to perform gather operations on a tensor */
+class CLGatherKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLGatherKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGatherKernel(const CLGatherKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGatherKernel &operator=(const CLGatherKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGatherKernel(CLGatherKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGatherKernel &operator=(CLGatherKernel &&) = default;
+    /** Default destructor */
+    ~CLGatherKernel() = default;
+    /** Initialise the kernel's inputs and outputs
+     *
+     * @param[in]  input   Source tensor. Supported tensor rank: up to 4. Data type supported: All.
+     * @param[in]  indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+     * @param[out] output  Destination tensor. Data type supported: Same as @p input
+     * @param[in]  axis    (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
+     */
+    void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
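+    /* Usage sketch (editorial addition): assumes `src`, `idx` and `dst` are pre-allocated
+     * CLTensor objects (illustrative names), with `idx` holding U32/S32 indices that are
+     * valid for axis 1 of `src`.
+     *
+     *   CLGatherKernel gather;
+     *   gather.configure(&src, &idx, &dst, 1); // gather along axis 1
+     */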
+    /** Initialise the kernel's inputs and outputs
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Supported tensor rank: up to 4. Data type supported: All.
+     * @param[in]  indices         Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+     * @param[out] output          Destination tensor. Data type supported: Same as @p input
+     * @param[in]  axis            (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
+     *
+     * @param[in] input   Source tensor info. Supported tensor rank: up to 4. Data type supported: All.
+     * @param[in] indices Indices tensor info. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+     * @param[in] output  Destination tensor info. Data type supported: Same as @p input
+     * @param[in] axis    (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;   /**< Source tensor */
+    const ICLTensor *_indices; /**< Indices tensor */
+    ICLTensor       *_output;  /**< Destination tensor */
+    int              _axis;    /**< Axis index */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGATHERKERNEL_H */
diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
index c9ed1ac..40e9658 100644
--- a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
+#include "src/core/CL/kernels/CLGaussian3x3Kernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.h b/src/core/CL/kernels/CLGaussian3x3Kernel.h
new file mode 100644
index 0000000..139b05d
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussian3x3Kernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H
+#define ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the Gaussian 3x3 filter kernel.
+ *
+ */
+class CLGaussian3x3Kernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input            An input tensor. Data types supported: U8
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
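+    /* Usage sketch (editorial addition): assumes `src` and `dst` are pre-allocated U8
+     * CLTensor objects (illustrative names); with border_undefined = false the caller is
+     * expected to fill the border (e.g. with a fill-border kernel) before running.
+     *
+     *   CLGaussian3x3Kernel gauss3x3;
+     *   gauss3x3.configure(&src, &dst, false);
+     */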
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            An input tensor. Data types supported: U8
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H */
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
index 5b3639f..46a7576 100644
--- a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
+++ b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
+#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
 
 #include <cstdint>
 
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.h b/src/core/CL/kernels/CLGaussian5x5Kernel.h
new file mode 100644
index 0000000..711710b
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussian5x5Kernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H
+#define ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H
+
+#include "src/core/CL/kernels/CLConvolutionKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run the horizontal pass of 5x5 Gaussian filter on a tensor. */
+class CLGaussian5x5HorKernel : public CLSeparableConvolution5x5HorKernel
+{
+public:
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+private:
+    //Make the configure method of the parent class private
+    using CLSeparableConvolution5x5HorKernel::configure;
+};
+
+/** Interface for the kernel to run the vertical pass of 5x5 Gaussian filter on a tensor. */
+class CLGaussian5x5VertKernel : public CLSeparableConvolution5x5VertKernel
+{
+public:
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @param[in]  input            Input tensor (output of the horizontal pass). Data types supported: S16.
+     * @param[out] output           Destination tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Input tensor (output of the horizontal pass). Data types supported: S16.
+     * @param[out] output           Destination tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+private:
+    //Make the configure method of the parent class private
+    using CLSeparableConvolution5x5VertKernel::configure;
+};
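+/* Usage sketch (editorial addition): the 5x5 Gaussian runs as two passes through an
+ * intermediate S16 tensor; `src`, `tmp_s16` and `dst` are assumed to be pre-allocated
+ * CLTensor objects (illustrative names).
+ *
+ *   CLGaussian5x5HorKernel  gauss_hor;
+ *   CLGaussian5x5VertKernel gauss_vert;
+ *   gauss_hor.configure(&src, &tmp_s16, false);
+ *   gauss_vert.configure(&tmp_s16, &dst, false);
+ */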
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H */
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
index 2686e8b..065f7f7 100644
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
+#include "src/core/CL/kernels/CLGaussianPyramidKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.h b/src/core/CL/kernels/CLGaussianPyramidKernel.h
new file mode 100644
index 0000000..a659544
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H
+#define ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H
+
+#include "src/core/CL/ICLSimpleKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a Gaussian filter and half scaling across width (horizontal pass) */
+class CLGaussianPyramidHorKernel : public ICLSimpleKernel
+{
+public:
+    /** Default constructor */
+    CLGaussianPyramidHorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGaussianPyramidHorKernel(const CLGaussianPyramidHorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGaussianPyramidHorKernel &operator=(const CLGaussianPyramidHorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGaussianPyramidHorKernel(CLGaussianPyramidHorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGaussianPyramidHorKernel &operator=(CLGaussianPyramidHorKernel &&) = default;
+    /** Default destructor */
+    ~CLGaussianPyramidHorKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input  Source tensor. Data types supported: U8.
+     * @param[out] output Destination tensor. Output should have half the input width. Data types supported: U16.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: U8.
+     * @param[out] output          Destination tensor. Output should have half the input width. Data types supported: U16.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    int _l2_load_offset;
+};
+
+/** OpenCL kernel to perform a Gaussian filter and half scaling across height (vertical pass) */
+class CLGaussianPyramidVertKernel : public ICLSimpleKernel
+{
+public:
+    /** Default constructor */
+    CLGaussianPyramidVertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGaussianPyramidVertKernel(const CLGaussianPyramidVertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGaussianPyramidVertKernel &operator=(const CLGaussianPyramidVertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGaussianPyramidVertKernel(CLGaussianPyramidVertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGaussianPyramidVertKernel &operator=(CLGaussianPyramidVertKernel &&) = default;
+    /** Default destructor */
+    ~CLGaussianPyramidVertKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input  Source tensor. Data types supported: U16.
+     * @param[out] output Destination tensor. Output should have half the input height. Data types supported: U8.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: U16.
+     * @param[out] output          Destination tensor. Output should have half the input height. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    int _t2_load_offset;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H */
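For orientation, one pyramid level produced by the pair of kernels above halves each spatial dimension: the horizontal pass halves the width (U8 -> U16) and the vertical pass halves the height (U16 -> U8). A trivial sketch of the resulting level shapes, assuming each level simply halves with rounding down (names are illustrative, not library API):

#include <utility>
#include <vector>

// Spatial sizes of each pyramid level, assuming every level halves width and height.
std::vector<std::pair<unsigned int, unsigned int>> pyramid_level_sizes(unsigned int width, unsigned int height, size_t num_levels)
{
    std::vector<std::pair<unsigned int, unsigned int>> sizes;
    for(size_t level = 0; level < num_levels; ++level)
    {
        sizes.emplace_back(width, height);
        width  /= 2; // CLGaussianPyramidHorKernel halves the width
        height /= 2; // CLGaussianPyramidVertKernel halves the height
    }
    return sizes;
}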
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index a2fcbba..dd3faf5 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
+#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
new file mode 100644
index 0000000..d26795a
--- /dev/null
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H
+#define ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for Compute All Anchors kernel */
+class CLComputeAllAnchorsKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLComputeAllAnchorsKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLComputeAllAnchorsKernel(const CLComputeAllAnchorsKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLComputeAllAnchorsKernel &operator=(const CLComputeAllAnchorsKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLComputeAllAnchorsKernel(CLComputeAllAnchorsKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLComputeAllAnchorsKernel &operator=(CLComputeAllAnchorsKernel &&) = default;
+    /** Default destructor */
+    ~CLComputeAllAnchorsKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  anchors     Source tensor. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
+     * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
+     * @param[in]  info        Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
+     *
+     */
+    void configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  anchors         Source tensor. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
+     * @param[out] all_anchors     Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
+     * @param[in]  info            Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
+     *
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel
+     *
+     * @param[in] anchors     Source tensor info. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
+     * @param[in] all_anchors Destination tensor info. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
+     * @param[in] info        Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
+     *
+     * @return a Status
+     */
+    static Status validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_anchors;
+    ICLTensor       *_all_anchors;
+};
+} // namespace arm_compute
+#endif // ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H
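As a rough illustration of what CLComputeAllAnchorsKernel produces: each of the A base anchors is replicated at every (x, y) position of the feature map and shifted by the feature stride, yielding (4, H*W*A) values. A host-side sketch under that assumption, with anchors stored as (x1, y1, x2, y2) and a uniform feature_stride; the exact layout and ordering used by the CL kernel follow ComputeAnchorsInfo and are not shown in this header.

#include <vector>

// anchors: A * 4 values (x1, y1, x2, y2). Returns H * W * A * 4 values.
std::vector<float> compute_all_anchors_reference(const std::vector<float> &anchors, int A, int H, int W, float feature_stride)
{
    std::vector<float> all_anchors;
    all_anchors.reserve(static_cast<size_t>(H) * W * A * 4);
    for(int y = 0; y < H; ++y)
        for(int x = 0; x < W; ++x)
            for(int a = 0; a < A; ++a)
            {
                const float shift_x = x * feature_stride;
                const float shift_y = y * feature_stride;
                all_anchors.push_back(anchors[a * 4 + 0] + shift_x);
                all_anchors.push_back(anchors[a * 4 + 1] + shift_y);
                all_anchors.push_back(anchors[a * 4 + 2] + shift_x);
                all_anchors.push_back(anchors[a * 4 + 3] + shift_y);
            }
    return all_anchors;
}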
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
index eaf5ea4..cd3f1ee 100644
--- a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.h b/src/core/CL/kernels/CLHOGDescriptorKernel.h
new file mode 100644
index 0000000..eee2fa3
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H
+#define ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H
+
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/Size2D.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** OpenCL kernel to perform HOG Orientation Binning */
+class CLHOGOrientationBinningKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLHOGOrientationBinningKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGOrientationBinningKernel(const CLHOGOrientationBinningKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGOrientationBinningKernel &operator=(const CLHOGOrientationBinningKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHOGOrientationBinningKernel(CLHOGOrientationBinningKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHOGOrientationBinningKernel &operator=(CLHOGOrientationBinningKernel &&) = default;
+    /** Default destructor */
+    ~CLHOGOrientationBinningKernel() = default;
+
+    /**  Initialise the kernel's inputs, output and HOG's metadata
+     *
+     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
+     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
+     * @param[out] output          Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[in]  hog_info        HOG's metadata
+     */
+    void configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
+    /**  Initialise the kernel's inputs, output and HOG's metadata
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
+     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
+     * @param[out] output          Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[in]  hog_info        HOG's metadata
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input_magnitude;
+    const ICLTensor *_input_phase;
+    ICLTensor       *_output;
+    Size2D           _cell_size;
+};
+
+/** OpenCL kernel to perform HOG block normalization */
+class CLHOGBlockNormalizationKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLHOGBlockNormalizationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGBlockNormalizationKernel(const CLHOGBlockNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGBlockNormalizationKernel &operator=(const CLHOGBlockNormalizationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHOGBlockNormalizationKernel(CLHOGBlockNormalizationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHOGBlockNormalizationKernel &operator=(CLHOGBlockNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~CLHOGBlockNormalizationKernel() = default;
+
+    /** Initialise the kernel's input, output and HOG's metadata
+     *
+     * @param[in]  input    Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[out] output   Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog_info HOG's metadata
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
+    /** Initialise the kernel's input, output and HOG's metadata
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[out] output          Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog_info        HOG's metadata
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    Size2D           _num_cells_per_block_stride;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H */
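For orientation, the two HOG kernels above correspond to the usual descriptor pipeline: per-cell orientation binning of gradient magnitude and phase, followed by per-block normalisation of the concatenated cell histograms. A simplified scalar sketch of both steps, assuming hard binning and plain L2 normalisation (the library supports several HOGNormType values; function names here are illustrative, not library API):

#include <cmath>
#include <vector>

// Orientation binning for one cell: phase in degrees [0, 360), num_bins histogram bins.
std::vector<float> bin_cell(const std::vector<float> &magnitude, const std::vector<float> &phase, unsigned int num_bins)
{
    std::vector<float> hist(num_bins, 0.f);
    const float bin_width = 360.f / num_bins;
    for(size_t i = 0; i < magnitude.size(); ++i)
    {
        const unsigned int bin = static_cast<unsigned int>(phase[i] / bin_width) % num_bins;
        hist[bin] += magnitude[i];
    }
    return hist;
}

// L2 normalisation over the concatenated cell histograms of one block.
void normalize_block_l2(std::vector<float> &block, float epsilon = 1e-5f)
{
    float sum_sq = 0.f;
    for(float v : block)
        sum_sq += v * v;
    const float inv_norm = 1.f / std::sqrt(sum_sq + epsilon);
    for(float &v : block)
        v *= inv_norm;
}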
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
index 6e14996..861155b 100644
--- a/src/core/CL/kernels/CLHOGDetectorKernel.cpp
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.h b/src/core/CL/kernels/CLHOGDetectorKernel.h
new file mode 100644
index 0000000..c28e6eb
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLHOGDETECTORKERNEL_H
+#define ARM_COMPUTE_CLHOGDETECTORKERNEL_H
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace cl
+{
+class Buffer;
+}
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform HOG detector kernel using linear SVM */
+class CLHOGDetectorKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLHOGDetectorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetectorKernel(const CLHOGDetectorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetectorKernel &operator=(const CLHOGDetectorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHOGDetectorKernel(CLHOGDetectorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHOGDetectorKernel &operator=(CLHOGDetectorKernel &&) = default;
+    /** Default destructor */
+    ~CLHOGDetectorKernel() = default;
+
+    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
+     *
+     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog                     HOG data object used by @ref CLHOGOrientationBinningKernel and  @ref CLHOGBlockNormalizationKernel
+     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
+     * @param[in]  num_detection_windows   Number of detected objects
+     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be a multiple of the hog->info()->block_stride()
+     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
+     */
+    void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f,
+                   uint16_t idx_class = 0);
+    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
+     *
+     * @param[in]  compile_context         The compile context to be used.
+     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog                     HOG data object used by @ref CLHOGOrientationBinningKernel and  @ref CLHOGBlockNormalizationKernel
+     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
+     * @param[in]  num_detection_windows   Number of detected objects
+     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be a multiple of the hog->info()->block_stride()
+     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows,
+                   const Size2D &detection_window_stride, float threshold = 0.0f,
+                   uint16_t idx_class = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor         *_input;
+    ICLDetectionWindowArray *_detection_windows;
+    cl::Buffer              *_num_detection_windows;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLHOGDETECTORKERNEL_H */
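The detector kernel above slides a window over the block-normalised descriptor and classifies each position with a linear SVM: conceptually, a dot product of the window's descriptor against the HOG model weights plus a bias, kept when the score exceeds the threshold. A scalar sketch under that assumption (names are illustrative, not library API); a kept window would then be appended to the detection_windows array and the counter behind num_detection_windows incremented.

#include <vector>

// window_descriptor and svm_weights have the same length; returns true when the window is kept.
bool classify_window(const std::vector<float> &window_descriptor, const std::vector<float> &svm_weights,
                     float bias, float threshold, float &score_out)
{
    float score = bias;
    for(size_t i = 0; i < window_descriptor.size(); ++i)
        score += window_descriptor[i] * svm_weights[i];
    score_out = score;
    return score > threshold;
}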
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
index 19c4e57..cbc056f 100644
--- a/src/core/CL/kernels/CLHarrisCornersKernel.cpp
+++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+#include "src/core/CL/kernels/CLHarrisCornersKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.h b/src/core/CL/kernels/CLHarrisCornersKernel.h
new file mode 100644
index 0000000..6482b0a
--- /dev/null
+++ b/src/core/CL/kernels/CLHarrisCornersKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLHARRISCORNERSKERNEL_H
+#define ARM_COMPUTE_CLHARRISCORNERSKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the Harris score kernel.
+ *
+ * @note The implementation supports 3, 5, and 7 for the block_size.
+ */
+class CLHarrisScoreKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLHarrisScoreKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHarrisScoreKernel(const CLHarrisScoreKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHarrisScoreKernel &operator=(const CLHarrisScoreKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHarrisScoreKernel(CLHarrisScoreKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHarrisScoreKernel &operator=(CLHarrisScoreKernel &&) = default;
+    /** Default destructor */
+    ~CLHarrisScoreKernel() = default;
+
+    /** Setup the kernel parameters
+     *
+     * @param[in]  input1           Source image (gradient X). Data types supported: S16, S32. (Must be the same as input2)
+     * @param[in]  input2           Source image (gradient Y). Data types supported: S16, S32. (Must be the same as input1)
+     * @param[out] output           Destination image (Harris score). Data types supported: F32
+     * @param[in]  block_size       The block window size used to compute the Harris Corner score. Supports: 3, 5 and 7
+     * @param[in]  norm_factor      Normalization factor to use according to the gradient size (Must be different from 0)
+     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
+                   int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
+                   bool border_undefined);
+    /** Setup the kernel parameters
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input1           Source image (gradient X). Data types supported: S16, S32. (Must be the same as input2)
+     * @param[in]  input2           Source image (gradient Y). Data types supported: S16, S32. (Must be the same as input1)
+     * @param[out] output           Destination image (Harris score). Data types supported: F32
+     * @param[in]  block_size       The block window size used to compute the Harris Corner score. Supports: 3, 5 and 7
+     * @param[in]  norm_factor      Normalization factor to use according to the gradient size (Must be different from 0)
+     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLImage *input1, const ICLImage *input2, ICLImage *output,
+                   int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
+                   bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+protected:
+    const ICLImage *_input1;          /**< Source image - Gx component */
+    const ICLImage *_input2;          /**< Source image - Gy component */
+    ICLImage       *_output;          /**< Source image - Harris score */
+    float           _sensitivity;     /**< Sensitivity value */
+    float           _strength_thresh; /**< Threshold value */
+    float           _norm_factor;     /**< Normalization factor */
+    BorderSize      _border_size;     /**< Border size */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLHARRISCORNERSKERNEL_H */
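As context for the parameters above, the Harris-Stephens response at a pixel is computed from the gradient covariance accumulated over the block_size window, R = det(M) - sensitivity * trace(M)^2, with scores below strength_thresh discarded. A scalar reference sketch under those assumptions; where exactly norm_factor is applied (to the gradients here) is also an assumption, and the caller must keep (x, y) at least block_size/2 away from the image border.

#include <cstdint>
#include <vector>

// gx, gy: gradient images (e.g. S16 from a 3x3 Sobel), width x height.
float harris_score_at(const std::vector<int16_t> &gx, const std::vector<int16_t> &gy,
                      int width, int x, int y, int block_size,
                      float norm_factor, float sensitivity, float strength_thresh)
{
    const int r = block_size / 2;
    float gx2 = 0.f, gy2 = 0.f, gxgy = 0.f;
    for(int dy = -r; dy <= r; ++dy)
        for(int dx = -r; dx <= r; ++dx)
        {
            const float ix = gx[(y + dy) * width + (x + dx)] * norm_factor;
            const float iy = gy[(y + dy) * width + (x + dx)] * norm_factor;
            gx2  += ix * ix;
            gy2  += iy * iy;
            gxgy += ix * iy;
        }
    const float det   = gx2 * gy2 - gxgy * gxgy;
    const float trace = gx2 + gy2;
    const float score = det - sensitivity * trace * trace;
    return (score > strength_thresh) ? score : 0.f;
}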
diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
index 3f5e91e..8aa7366 100644
--- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.h b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.h
new file mode 100644
index 0000000..f4cb627
--- /dev/null
+++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H
+#define ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** Interface for the height concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class CLHeightConcatenateLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLHeightConcatenateLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHeightConcatenateLayerKernel(const CLHeightConcatenateLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHeightConcatenateLayerKernel &operator=(const CLHeightConcatenateLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHeightConcatenateLayerKernel(CLHeightConcatenateLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHeightConcatenateLayerKernel &operator=(CLHeightConcatenateLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLHeightConcatenateLayerKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data types supported: All.
+     * @param[in]  height_offset   The starting offset on the Y axis for the output tensor.
+     * @param[out] output          Output tensor. Data types supported: Same as @p input.
+     *
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int height_offset, ITensorInfo *output);
+    /**  Static function to check if given info will lead to a valid configuration of @ref CLHeightConcatenateLayerKernel
+     *
+     * @param[in] input         Input tensor info. Data types supported: All.
+     * @param[in] height_offset The starting offset on the Y axis for the output tensor.
+     * @param[in] output        Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    unsigned int _height_offset;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H */
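For clarity on height_offset above: each input is written into the output starting at row height_offset along the Y axis, so concatenating inputs of heights h0, h1, ... uses offsets 0, h0, h0+h1, and so on. A trivial sketch of that bookkeeping (illustrative only, not library API):

#include <vector>

// Given the heights of the tensors to concatenate, returns the height_offset per input.
std::vector<unsigned int> height_offsets(const std::vector<unsigned int> &heights)
{
    std::vector<unsigned int> offsets;
    unsigned int acc = 0;
    for(unsigned int h : heights)
    {
        offsets.push_back(acc); // this input occupies output rows [acc, acc + h)
        acc += h;
    }
    return offsets;
}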
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
index a85429c..ca5322a 100644
--- a/src/core/CL/kernels/CLHistogramKernel.cpp
+++ b/src/core/CL/kernels/CLHistogramKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
+#include "src/core/CL/kernels/CLHistogramKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLDistribution1D.h"
diff --git a/src/core/CL/kernels/CLHistogramKernel.h b/src/core/CL/kernels/CLHistogramKernel.h
new file mode 100644
index 0000000..9c97c65
--- /dev/null
+++ b/src/core/CL/kernels/CLHistogramKernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLHISTOGRAMKERNEL_H
+#define ARM_COMPUTE_CLHISTOGRAMKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLDistribution1D;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface to run the histogram kernel. This kernel processes the part of the image whose width is a multiple of 16.
+ *  If the image width is not a multiple of 16, the remaining pixels are processed by @ref CLHistogramBorderKernel
+ */
+class CLHistogramKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLHistogramKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHistogramKernel(const CLHistogramKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHistogramKernel &operator=(const CLHistogramKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHistogramKernel(CLHistogramKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHistogramKernel &operator=(CLHistogramKernel &&) = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input  Source image. Data types supported: U8.
+     * @param[out] output Destination distribution.
+     */
+    void configure(const ICLImage *input, ICLDistribution1D *output);
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source image. Data types supported: U8.
+     * @param[out] output          Destination distribution.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLImage    *_input;
+    ICLDistribution1D *_output;
+};
+
+/** Interface to run the histogram kernel to handle the leftover part of the image
+ *
+ */
+class CLHistogramBorderKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLHistogramBorderKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHistogramBorderKernel(const CLHistogramBorderKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHistogramBorderKernel &operator=(const CLHistogramBorderKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHistogramBorderKernel(CLHistogramBorderKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHistogramBorderKernel &operator=(CLHistogramBorderKernel &&) = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input  Source image. Data types supported: U8.
+     * @param[out] output Destination distribution.
+     */
+    void configure(const ICLImage *input, ICLDistribution1D *output);
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source image. Data types supported: U8.
+     * @param[out] output          Destination distribution.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLImage    *_input;
+    ICLDistribution1D *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLHISTOGRAMKERNEL_H*/
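The split between the two histogram kernels above is purely on image width: the main kernel covers the largest width that is a multiple of 16 and the border kernel covers the remaining columns. A small worked example of that split (illustrative only):

#include <cstdio>

int main()
{
    const unsigned int width          = 1000;
    const unsigned int main_width     = (width / 16) * 16;  // processed by CLHistogramKernel (992)
    const unsigned int leftover_width = width - main_width; // processed by CLHistogramBorderKernel (8)
    std::printf("main: %u columns, border: %u columns\n", main_width, leftover_width);
    return 0;
}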
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 76490f8..0789cdc 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "src/core/CL/kernels/CLIm2ColKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLIm2ColKernel.h b/src/core/CL/kernels/CLIm2ColKernel.h
new file mode 100644
index 0000000..2920c7d
--- /dev/null
+++ b/src/core/CL/kernels/CLIm2ColKernel.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLIM2COLKERNEL_H
+#define ARM_COMPUTE_CLIM2COLKERNEL_H
+
+#include "arm_compute/core/Size2D.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column.
+ * It is used to transform a convolution to a plain matrix multiplication.
+ *
+ * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * =
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ */
+class CLIm2ColKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLIm2ColKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLIm2ColKernel(const CLIm2ColKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLIm2ColKernel &operator=(const CLIm2ColKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLIm2ColKernel(CLIm2ColKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLIm2ColKernel &operator=(CLIm2ColKernel &&) = default;
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                         while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[out] output      The output tensor. First 2 lower dimensions represent a transform of each 3D input,
+     *                         while every dimension above represents a batch. Data types supported: Same as @p input
+     * @param[in]  kernel_dims The kernel dimensions (width and height).
+     * @param[in]  conv_info   Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  has_bias    In case biases are provided, expands the matrix with 1.
+     *                         This is valid only for non-quantized inputs.
+     * @param[in]  dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  num_groups  (Optional) Number of groups when performing a grouped convolution.
+     *                         Number of groups other than 1 is only supported for NCHW data layout.
+     *                         The number of channels should be a multiple of the number of groups.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
+                   unsigned int num_groups = 1);
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[out] output          The output tensor. First 2 lower dimensions represent a transform of each 3D input,
+     *                             while every dimension above represents a batch. Data types supported: Same as @p input
+     * @param[in]  kernel_dims     The kernel dimensions (width and height).
+     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  has_bias        In case biases are provided, expands the matrix with 1.
+     * @param[in]  dilation        (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
+                   const Size2D &dilation   = Size2D(1U, 1U),
+                   unsigned int  num_groups = 1);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel
+     *
+     * @param[in] input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                        while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in] output      The output tensor. First 2 lower dimensions represent a transform of each 3D input,
+     *                        while every dimension above represents a batch. Data types supported: Same as @p input
+     * @param[in] kernel_dims The kernel dimensions (width and height).
+     * @param[in] conv_info   Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in] has_bias    In case biases are provided, expands the matrix with 1.
+     *                        This is valid only for non-quantized inputs.
+     * @param[in] dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in] num_groups  (Optional) Number of groups when performing a grouped convolution.
+     *                        Number of groups other than 1 is only supported for NCHW data layout.
+     *                        The number of channels should be a multiple of the number of groups.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
+                           unsigned int num_groups = 1);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+public:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    DataLayout       _data_layout;
+    std::pair<unsigned int, unsigned int> _convolved_dims;
+    unsigned int  _num_elems_processed_per_iteration;
+    Size2D        _kernel_dims;
+    PadStrideInfo _conv_info;
+    unsigned int  _num_groups;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLIM2COLKERNEL_H */
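A host-side reference of the rearrangement documented in the class comment above, for a single-channel NCHW input with no padding and no bias column; applied to a 4x4 input with a 3x3 kernel and stride 1 it reproduces the 4x9 matrix shown there. Dilation, groups, batching and the quantized paths of the CL kernel are omitted, and the names are illustrative, not library API.

#include <vector>

// src: width x height single-channel image. Returns (out_w * out_h) rows of (kw * kh) values.
std::vector<float> im2col_reference(const std::vector<float> &src, int width, int height,
                                    int kw, int kh, int stride_x, int stride_y)
{
    const int out_w = (width - kw) / stride_x + 1;
    const int out_h = (height - kh) / stride_y + 1;
    std::vector<float> dst;
    dst.reserve(static_cast<size_t>(out_w) * out_h * kw * kh);
    for(int oy = 0; oy < out_h; ++oy)
        for(int ox = 0; ox < out_w; ++ox) // one output row per convolution window
            for(int ky = 0; ky < kh; ++ky)
                for(int kx = 0; kx < kw; ++kx)
                    dst.push_back(src[(oy * stride_y + ky) * width + (ox * stride_x + kx)]);
    return dst;
}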
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
index e97b856..4c3b404 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
+#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
new file mode 100644
index 0000000..d4444f0
--- /dev/null
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for performing an instance normalization */
+class CLInstanceNormalizationLayerKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLInstanceNormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLInstanceNormalizationLayerKernel(const CLInstanceNormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLInstanceNormalizationLayerKernel &operator=(const CLInstanceNormalizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    CLInstanceNormalizationLayerKernel(CLInstanceNormalizationLayerKernel &&) = default;
+    /** Default move assignment operator */
+    CLInstanceNormalizationLayerKernel &operator=(CLInstanceNormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLInstanceNormalizationLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in, out] input  Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
+     *                        In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+     * @param[out]     output Destination tensor. Data types and data layouts supported: same as @p input.
+     * @param[in]      info   Kernel meta-data descriptor
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
+    /** Set the input and output tensors.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input           Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
+     *                                 In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+     * @param[out]     output          Destination tensor. Data types and data layouts supported: same as @p input.
+     * @param[in]      info            Kernel meta-data descriptor
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
+     *
+     * @param[in] input  Source tensor info. Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+     * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
+     * @param[in] info   Kernel meta-data descriptor
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor *_input;
+    ICLTensor *_output;
+    bool       _run_in_place;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H */
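For reference, instance normalisation as performed by the kernel above normalises each (batch, channel) plane independently: y = gamma * (x - mean) / sqrt(var + epsilon) + beta, with mean and var computed over the spatial dimensions of that plane. A scalar sketch with gamma, beta and epsilon passed directly; the kernel receives them through InstanceNormalizationLayerKernelInfo, and the mixed-precision and NHWC details are omitted here.

#include <cmath>
#include <vector>

// Normalise one H*W plane in place.
void instance_normalize_plane(std::vector<float> &plane, float gamma, float beta, float epsilon)
{
    float mean = 0.f;
    for(float v : plane)
        mean += v;
    mean /= plane.size();

    float var = 0.f;
    for(float v : plane)
        var += (v - mean) * (v - mean);
    var /= plane.size();

    const float inv_stddev = 1.f / std::sqrt(var + epsilon);
    for(float &v : plane)
        v = gamma * (v - mean) * inv_stddev + beta;
}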
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
index 82f6da8..5e5683d 100644
--- a/src/core/CL/kernels/CLIntegralImageKernel.cpp
+++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
+#include "src/core/CL/kernels/CLIntegralImageKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.h b/src/core/CL/kernels/CLIntegralImageKernel.h
new file mode 100644
index 0000000..0e40e3a
--- /dev/null
+++ b/src/core/CL/kernels/CLIntegralImageKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H
+#define ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface to run the horizontal pass of the integral image kernel. */
+class CLIntegralImageHorKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  An input tensor. Data types supported: U8
+     * @param[out] output Destination tensor. Data types supported: U32.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           An input tensor. Data types supported: U8
+     * @param[out] output          Destination tensor. Data types supported: U32.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+};
+
+/** Interface to run the vertical pass of the integral image kernel. */
+class CLIntegralImageVertKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLIntegralImageVertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLIntegralImageVertKernel(const CLIntegralImageVertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLIntegralImageVertKernel &operator=(const CLIntegralImageVertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLIntegralImageVertKernel(CLIntegralImageVertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLIntegralImageVertKernel &operator=(CLIntegralImageVertKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in,out] in_out The input/output tensor. Data types supported: U32
+     */
+    void configure(ICLTensor *in_out);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] in_out          The input/output tensor. Data types supported: U32
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *in_out);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor *_in_out;
+};
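+
+/* Usage sketch (illustrative only): the two passes are meant to be chained, with the
+ * vertical pass running in place on the U32 output of the horizontal pass. CLTensor,
+ * TensorInfo and CLScheduler are assumed from the public runtime API.
+ *
+ *   CLTensor src, dst;
+ *   src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
+ *   dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U32));
+ *   src.allocator()->allocate();
+ *   dst.allocator()->allocate();
+ *
+ *   CLIntegralImageHorKernel  hor;
+ *   CLIntegralImageVertKernel vert;
+ *   hor.configure(&src, &dst);  // horizontal prefix sums: U8 -> U32
+ *   vert.configure(&dst);       // vertical pass, in place on the U32 tensor
+ *   CLScheduler::get().enqueue(hor);
+ *   CLScheduler::get().enqueue(vert);
+ */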
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H */
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 9936e29..9e91d98 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
+#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
new file mode 100644
index 0000000..edc0585
--- /dev/null
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
+#define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for performing a L2 normalize on a given axis given the square sum of it in this axis */
+class CLL2NormalizeLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLL2NormalizeLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLL2NormalizeLayerKernel(const CLL2NormalizeLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLL2NormalizeLayerKernel &operator=(const CLL2NormalizeLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLL2NormalizeLayerKernel(CLL2NormalizeLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLL2NormalizeLayerKernel &operator=(CLL2NormalizeLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLL2NormalizeLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+     * @param[in]  sum     Sum values tensor. Data types supported: same as @p input.
+     *                     Sum will have the same number of dimensions as input.
+     * @param[out] output  Destination tensor. Data types and data layouts supported: Same as @p input.
+     *                     Output will have the same number of dimensions as input.
+     * @param[in]  axis    Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis: 2
+     * @param[in]  epsilon Lower bound value for the normalization.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+     * @param[in]  sum             Sum values tensor. Data types supported: same as @p input.
+     *                             Sum will have the same number of dimensions as input.
+     * @param[out] output          Destination tensor. Data types and data layouts supported: Same as @p input.
+     *                             Output will have the same number of dimensions as input.
+     * @param[in]  axis            Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis: 2
+     * @param[in]  epsilon         Lower bound value for the normalization.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel.
+     *
+     * @param[in] input   Source tensor info. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+     * @param[in] sum     Sum values tensor info. Data types supported: same as @p input.
+     *                    Sum will have the same number of dimensions as input.
+     * @param[in] output  Destination tensor info. Data types and data layouts supported: Same as @p input.
+     *                    Output will have the same number of dimensions as input.
+     * @param[in] axis    Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis: 2
+     * @param[in] epsilon Lower bound value for the normalization.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    const ICLTensor *_sum;
+    ICLTensor       *_output;
+    unsigned int     _actual_axis;
+    float            _epsilon;
+};
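+
+/* Usage sketch (illustrative only). The sum tensor must already hold the square sum along
+ * the reduction axis (typically produced by a preceding reduction kernel); it is assumed
+ * to be configured and filled elsewhere. CLTensor and CLScheduler come from the public
+ * runtime API.
+ *
+ *   CLTensor src, sq_sum, dst;   // F32; sq_sum holds the per-row square sums for axis 0
+ *   // ... init and allocate the three tensors ...
+ *
+ *   CLL2NormalizeLayerKernel l2norm;
+ *   l2norm.configure(&src, &sq_sum, &dst, 0, 1e-12f);   // axis = 0, epsilon = 1e-12
+ *   CLScheduler::get().enqueue(l2norm);
+ */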
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
index 0fa2e70..a439c24 100644
--- a/src/core/CL/kernels/CLLKTrackerKernel.cpp
+++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+#include "src/core/CL/kernels/CLLKTrackerKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLArray.h"
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.h b/src/core/CL/kernels/CLLKTrackerKernel.h
new file mode 100644
index 0000000..2d29668
--- /dev/null
+++ b/src/core/CL/kernels/CLLKTrackerKernel.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLLKTRACKERKERNEL_H
+#define ARM_COMPUTE_CLLKTRACKERKERNEL_H
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface to run the initialization step of LKTracker */
+class CLLKTrackerInitKernel : public ICLKernel
+{
+public:
+    /** Initialise the kernel input and output
+     *
+     * @param[in]  old_points           Pointer to the @ref ICLKeyPointArray storing old key points
+     * @param[in]  new_points_estimates Pointer to the @ref ICLKeyPointArray storing the new estimated key points
+     * @param[out] old_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint old points
+     * @param[out] new_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint new points
+     * @param[in]  use_initial_estimate The flag to indicate whether the initial estimated position should be used
+     * @param[in]  level                The pyramid level
+     * @param[in]  num_levels           The number of pyramid levels
+     * @param[in]  pyramid_scale        Scale factor used for generating the pyramid
+     */
+    void configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
+                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+                   bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale);
+    /** Initialise the kernel input and output
+     *
+     * @param[in]  compile_context      The compile context to be used.
+     * @param[in]  old_points           Pointer to the @ref ICLKeyPointArray storing old key points
+     * @param[in]  new_points_estimates Pointer to the @ref ICLKeyPointArray storing the new estimated key points
+     * @param[out] old_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint old points
+     * @param[out] new_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint new points
+     * @param[in]  use_initial_estimate The flag to indicate whether the initial estimated position should be used
+     * @param[in]  level                The pyramid level
+     * @param[in]  num_levels           The number of pyramid levels
+     * @param[in]  pyramid_scale        Scale factor used for generating the pyramid
+     */
+    void configure(const CLCompileContext &compile_context, const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
+                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+                   bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** Interface to run the finalize step of LKTracker, where it truncates the coordinates stored in new_points array */
+class CLLKTrackerFinalizeKernel : public ICLKernel
+{
+public:
+    /** Initialise the kernel input and output
+     *
+     * @param[in]  new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
+     * @param[out] new_points          Pointer to the @ref ICLKeyPointArray storing new key points
+     */
+    void configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points);
+    /** Initialise the kernel input and output
+     *
+     * @param[in]  compile_context     The compile context to be used.
+     * @param[in]  new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
+     * @param[out] new_points          Pointer to the @ref ICLKeyPointArray storing new key points
+     */
+    void configure(const CLCompileContext &compile_context, ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** Interface to run the first stage of LKTracker, where A11, A12, A22, min_eig, ival, ixval and iyval are computed */
+class CLLKTrackerStage0Kernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLLKTrackerStage0Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLKTrackerStage0Kernel(const CLLKTrackerStage0Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLKTrackerStage0Kernel &operator=(const CLLKTrackerStage0Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLLKTrackerStage0Kernel(CLLKTrackerStage0Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLLKTrackerStage0Kernel &operator=(CLLKTrackerStage0Kernel &&) = default;
+    /** Initialise the kernel input and output
+     *
+     * @param[in]      old_input           Pointer to the input old tensor. Data types supported: U8
+     * @param[in]      old_scharr_gx       Pointer to the input scharr X tensor. Data types supported: S16
+     * @param[in]      old_scharr_gy       Pointer to the input scharr Y tensor. Data types supported: S16
+     * @param[in]      old_points_internal Pointer to the array of CLLKInternalKeypoint old points
+     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points
+     * @param[out]     coeff_table         Pointer to the array holding the Spatial Gradient coefficients
+     * @param[out]     old_ival            Pointer to the array holding internal values
+     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
+     * @param[in]      level               The pyramid level
+     */
+    void configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
+                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+                   ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+                   size_t window_dimension, size_t level);
+    /** Initialise the kernel input and output
+     *
+     * @param[in]      compile_context     The compile context to be used.
+     * @param[in]      old_input           Pointer to the input old tensor. Data types supported: U8
+     * @param[in]      old_scharr_gx       Pointer to the input scharr X tensor. Data types supported: S16
+     * @param[in]      old_scharr_gy       Pointer to the input scharr Y tensor. Data types supported: S16
+     * @param[in]      old_points_internal Pointer to the array of CLLKInternalKeypoint old points
+     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points
+     * @param[out]     coeff_table         Pointer to the array holding the Spatial Gradient coefficients
+     * @param[out]     old_ival            Pointer to the array holding internal values
+     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
+     * @param[in]      level               The pyramid level
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
+                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+                   ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+                   size_t window_dimension, size_t level);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_old_input;
+    const ICLTensor *_old_scharr_gx;
+    const ICLTensor *_old_scharr_gy;
+};
+
+/** Interface to run the second stage of LKTracker, where the motion vectors of the given points are computed */
+class CLLKTrackerStage1Kernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLLKTrackerStage1Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLKTrackerStage1Kernel(const CLLKTrackerStage1Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLKTrackerStage1Kernel &operator=(const CLLKTrackerStage1Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLLKTrackerStage1Kernel(CLLKTrackerStage1Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLLKTrackerStage1Kernel &operator=(CLLKTrackerStage1Kernel &&) = default;
+    /** Initialise the kernel input and output
+     *
+     * @param[in]      new_input           Pointer to the input new tensor. Data types supported: U8
+     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points
+     * @param[in]      coeff_table         Pointer to the array holding the Spatial Gradient coefficients
+     * @param[in]      old_ival            Pointer to the array holding internal values
+     * @param[in]      termination         The criteria to terminate the search of each keypoint.
+     * @param[in]      epsilon             The error for terminating the algorithm
+     * @param[in]      num_iterations      The maximum number of iterations before terminating the algorithm
+     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
+     * @param[in]      level               The pyramid level
+     */
+    void configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+                   Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level);
+    /** Initialise the kernel input and output
+     *
+     * @param[in]      compile_context     The compile context to be used.
+     * @param[in]      new_input           Pointer to the input new tensor. Data types supported: U8
+     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points
+     * @param[in]      coeff_table         Pointer to the array holding the Spatial Gradient coefficients
+     * @param[in]      old_ival            Pointer to the array holding internal values
+     * @param[in]      termination         The criteria to terminate the search of each keypoint.
+     * @param[in]      epsilon             The error for terminating the algorithm
+     * @param[in]      num_iterations      The maximum number of iterations before terminating the algorithm
+     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
+     * @param[in]      level               The pyramid level
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+                   Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_new_input;
+};
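+
+/* Usage sketch (illustrative only): the four kernels above form a per-pyramid-level
+ * pipeline, run from the coarsest level down to level 0. The keypoint and scratch arrays,
+ * the image pyramids and the Scharr gradients are assumed to be allocated elsewhere (the
+ * public CLOpticalFlow function owns them in practice); only the kernel ordering is shown.
+ *
+ *   for(size_t level = num_levels; level > 0; --level)
+ *   {
+ *       CLLKTrackerInitKernel   init;
+ *       CLLKTrackerStage0Kernel stage0;
+ *       CLLKTrackerStage1Kernel stage1;
+ *
+ *       init.configure(old_points, new_points_estimates, old_internal, new_internal,
+ *                      use_initial_estimate, level - 1, num_levels, pyramid_scale);
+ *       stage0.configure(old_pyramid_level, old_scharr_gx, old_scharr_gy,
+ *                        old_internal, new_internal, coeff_table, old_ival,
+ *                        window_dimension, level - 1);
+ *       stage1.configure(new_pyramid_level, new_internal, coeff_table, old_ival,
+ *                        termination, epsilon, num_iterations, window_dimension, level - 1);
+ *
+ *       CLScheduler::get().enqueue(init);
+ *       CLScheduler::get().enqueue(stage0);
+ *       CLScheduler::get().enqueue(stage1);
+ *   }
+ *
+ *   CLLKTrackerFinalizeKernel finalize;
+ *   finalize.configure(new_internal, new_points);
+ *   CLScheduler::get().enqueue(finalize);
+ */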
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLLKTRACKERKERNEL_H */
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
index 6e4c45e..49e04c3 100644
--- a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+#include "src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
new file mode 100644
index 0000000..5d0a22a
--- /dev/null
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
+#define ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply each row of the first tensor with the lowest 2 dimensions of the second tensor.
+ *
+ * @attention The second input tensor must have at least 2 dimensions (matrix)
+ *
+ */
+class CLLocallyConnectedMatrixMultiplyKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLLocallyConnectedMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLocallyConnectedMatrixMultiplyKernel(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLocallyConnectedMatrixMultiplyKernel &operator=(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLLocallyConnectedMatrixMultiplyKernel(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLLocallyConnectedMatrixMultiplyKernel &operator=(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input0 First input tensor. Data types supported: F32
+     * @param[in]  input1 Second input tensor. Data type supported: same as @p input0
+     * @param[out] output Output tensor to store the result. Data type supported: same as @p input0
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input0          First input tensor. Data types supported: F32
+     * @param[in]  input1          Second input tensor. Data type supported: same as @p input0
+     * @param[out] output          Output tensor to store the result. Data type supported: same as @p input0
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLLocallyConnectedMatrixMultiplyKernel
+     *
+     * @param[in] input0 First input tensor info. Data types supported: F32
+     * @param[in] input1 Second input tensor info. Data type supported: same as @p input0
+     * @param[in] output Output tensor info. Data type supported: same as @p input0
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    ICLTensor       *_output;
+};
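+
+/* Usage sketch (illustrative only), assuming CLTensor, TensorInfo and CLScheduler from the
+ * public runtime API. Shapes are placeholders; input1 must carry at least 2 dimensions.
+ *
+ *   CLTensor input0, input1, output;
+ *   // ... init and allocate the tensors with DataType::F32 and compatible shapes ...
+ *
+ *   ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedMatrixMultiplyKernel::validate(
+ *       input0.info(), input1.info(), output.info()));
+ *
+ *   CLLocallyConnectedMatrixMultiplyKernel mm;
+ *   mm.configure(&input0, &input1, &output);
+ *   CLScheduler::get().enqueue(mm);
+ */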
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H */
diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
index dc130d0..9845dd6 100644
--- a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
+++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.h b/src/core/CL/kernels/CLMagnitudePhaseKernel.h
new file mode 100644
index 0000000..514036b
--- /dev/null
+++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H
+#define ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to compute magnitude and phase.
+ *
+ */
+class CLMagnitudePhaseKernel : public ICLKernel
+{
+public:
+    /** Default constructor. */
+    CLMagnitudePhaseKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMagnitudePhaseKernel(const CLMagnitudePhaseKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMagnitudePhaseKernel &operator=(const CLMagnitudePhaseKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMagnitudePhaseKernel(CLMagnitudePhaseKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMagnitudePhaseKernel &operator=(CLMagnitudePhaseKernel &&) = default;
+    /** Initialise the kernel's inputs and outputs.
+     *
+     * @note At least one of magnitude or phase must be set.
+     *
+     * @param[in]  gx         The input gradient X tensor. Data types supported: S16/S32.
+     * @param[in]  gy         The input gradient Y tensor. Data types supported: S16/S32.
+     * @param[out] magnitude  (Optional) The output tensor - Magnitude. Data types supported: S16/S32.
+     * @param[out] phase      (Optional) The output tensor - Phase. Data types supported: U8.
+     * @param[in]  mag_type   (Optional) Magnitude calculation type. Default: L2NORM.
+     * @param[in]  phase_type (Optional) Phase calculation type. Default: SIGNED.
+     */
+    void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
+                   MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
+    /** Initialise the kernel's inputs and outputs.
+     *
+     * @note At least one of magnitude or phase must be set.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  gx              The input gradient X tensor. Data types supported: S16/S32.
+     * @param[in]  gy              The input gradient Y tensor. Data types supported: S16/S32.
+     * @param[out] magnitude       (Optional) The output tensor - Magnitude. Data types supported: S16/S32.
+     * @param[out] phase           (Optional) The output tensor - Phase. Data types supported: U8.
+     * @param[in]  mag_type        (Optional) Magnitude calculation type. Default: L2NORM.
+     * @param[in]  phase_type      (Optional) Phase calculation type. Default: SIGNED.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
+                   MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_gx;        /**< Input gradient X. */
+    const ICLTensor *_gy;        /**< Input gradient Y. */
+    ICLTensor       *_magnitude; /**< Output - Magnitude. */
+    ICLTensor       *_phase;     /**< Output - Phase. */
+    bool             _run_mag;   /**< Calculate magnitude? */
+    bool             _run_phase; /**< Calculate phase? */
+};
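+
+/* Usage sketch (illustrative only): computing both magnitude and phase from a pair of S16
+ * gradient tensors. Either output may be skipped by passing nullptr. CLTensor and
+ * CLScheduler are assumed from the public runtime API.
+ *
+ *   CLTensor gx, gy, mag, phase;
+ *   // gx/gy/mag: DataType::S16, phase: DataType::U8, all with the same shape
+ *
+ *   CLMagnitudePhaseKernel mag_phase;
+ *   mag_phase.configure(&gx, &gy, &mag, &phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
+ *   CLScheduler::get().enqueue(mag_phase);
+ */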
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H */
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
index a78996d..2a1312a 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
+#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
new file mode 100644
index 0000000..86267ec
--- /dev/null
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMAXUNPOOLINGLAYERKERNEL_H
+#define ARM_COMPUTE_CLMAXUNPOOLINGLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the max unpooling layer kernel */
+class CLMaxUnpoolingLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLMaxUnpoolingLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMaxUnpoolingLayerKernel(const CLMaxUnpoolingLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMaxUnpoolingLayerKernel &operator=(const CLMaxUnpoolingLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMaxUnpoolingLayerKernel(CLMaxUnpoolingLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMaxUnpoolingLayerKernel &operator=(CLMaxUnpoolingLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLMaxUnpoolingLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @note The output shape must be equal to the shape of the original (pre-pooling) input.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  indices         Tensor containing the offset to store the input elements in the output tensor.
+     *                             @ref CLPoolingLayerKernel with indices should precede this function in order to
+     *                             properly reconstruct the output tensor.
+     *                             The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+     * @param[out] output          Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info       Contains pooling operation information described in @ref PoolingLayerInfo.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel
+     *
+     * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
+     * @param[in] indices   TensorInfo associated to the tensor containing the offset to store the input elements in the output tensor.
+     *                      @ref CLPoolingLayerKernel with indices should precede this function in order to
+     *                      properly reconstruct the output tensor.
+     *                      The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+
+    // Inherited methods overridden
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    const ICLTensor *_indices;
+};
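+
+/* Usage sketch (illustrative only): the indices tensor and pool_info must come from the max
+ * pooling kernel that produced the pooled tensor, and the output shape must match the
+ * original (pre-pooling) input. The PoolingLayerInfo construction below is a placeholder
+ * and has to mirror the configuration actually used for pooling; taking the compile
+ * context from CLKernelLibrary is just one possible way to obtain it.
+ *
+ *   CLTensor pooled, indices, unpooled;   // indices: DataType::U32
+ *   PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NHWC);
+ *
+ *   CLMaxUnpoolingLayerKernel unpool;
+ *   unpool.configure(CLKernelLibrary::get().get_compile_context(), &pooled, &indices, &unpooled, pool_info);
+ *   CLScheduler::get().enqueue(unpool);
+ */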
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLMAXUNPOOLINGLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
index 5acc3ac..aed6e6e 100644
--- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+#include "src/core/CL/kernels/CLMeanStdDevKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.h b/src/core/CL/kernels/CLMeanStdDevKernel.h
new file mode 100644
index 0000000..179a202
--- /dev/null
+++ b/src/core/CL/kernels/CLMeanStdDevKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMEANSTDDEVKERNEL_H
+#define ARM_COMPUTE_CLMEANSTDDEVKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace cl
+{
+class Buffer;
+}
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
+class CLMeanStdDevKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLMeanStdDevKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMeanStdDevKernel(const CLMeanStdDevKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMeanStdDevKernel &operator=(const CLMeanStdDevKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMeanStdDevKernel(CLMeanStdDevKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMeanStdDevKernel &operator=(CLMeanStdDevKernel &&) = default;
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input              Input image. Data types supported: U8.
+     * @param[out] mean               Output average pixel value.
+     * @param[out] global_sum         Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
+     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
+     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
+     */
+    void configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  input              Input image. Data types supported: U8.
+     * @param[out] mean               Output average pixel value.
+     * @param[out] global_sum         Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
+     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
+     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
+     */
+    void configure(const CLCompileContext &compile_context, const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevKernel.
+     *
+     * @param[in] input              Input image info. Data types supported: U8.
+     * @param[in] mean               Input average pixel value.
+     * @param[in] global_sum         Keeps global sum of pixel values.
+     * @param[in] stddev             (Optional) Output standard deviation of pixel values.
+     * @param[in] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+    BorderSize border_size() const override;
+
+private:
+    const ICLImage *_input;
+    float          *_mean;
+    float          *_stddev;
+    cl::Buffer     *_global_sum;
+    cl::Buffer     *_global_sum_squared;
+    BorderSize      _border_size;
+};
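+
+/* Usage sketch (illustrative only): the two cl::Buffer scratch objects hold the global
+ * (squared) pixel sums and must each be at least one cl_ulong in size; the way they are
+ * created below is an assumption of this sketch. CLTensor and CLScheduler come from the
+ * public runtime API.
+ *
+ *   CLTensor image;   // U8 image, already allocated and filled
+ *   float mean = 0.f, stddev = 0.f;
+ *   cl::Buffer sum(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
+ *   cl::Buffer sum_sq(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
+ *
+ *   CLMeanStdDevKernel msd;
+ *   msd.configure(&image, &mean, &sum, &stddev, &sum_sq);
+ *   CLScheduler::get().enqueue(msd);
+ */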
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLMEANSTDDEVKERNEL_H */
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index 82a22a9..a889df7 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
+#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
new file mode 100644
index 0000000..a1ba2b9
--- /dev/null
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H
+#define ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to normalize the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension. */
+class CLMeanStdDevNormalizationKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLMeanStdDevNormalizationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMeanStdDevNormalizationKernel(const CLMeanStdDevNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMeanStdDevNormalizationKernel &operator=(const CLMeanStdDevNormalizationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMeanStdDevNormalizationKernel(CLMeanStdDevNormalizationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMeanStdDevNormalizationKernel &operator=(CLMeanStdDevNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~CLMeanStdDevNormalizationKernel() = default;
+    /** Initialise the kernel's input and outputs.
+     *
+     * @note If the output tensor is a nullptr, the normalization will be performed in-place.
+     *
+     * @param[in, out] input   Source tensor with 2 dimensions. In case of @p output tensor = nullptr,
+     *                         this tensor will store the result of the normalization. Data types supported: F16/F32.
+     * @param[out]     output  (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
+     * @param[in]      epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
+     */
+    void configure(ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+    /** Initialise the kernel's input and outputs.
+     *
+     * @note If the output tensor is a nullptr, the normalization will be performed in-place.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input           Source tensor with 2 dimensions. In case of @p output tensor = nullptr,
+     *                                 this tensor will store the result of the normalization. Data types supported: F16/F32.
+     * @param[out]     output          (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
+     * @param[in]      epsilon         (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
+     *
+     * @param[in] input   Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
+     *                    this tensor will store the result of the normalization. Data types supported: F16/F32.
+     * @param[in] output  (Optional) Destination tensor info. It can be nullptr in case of in-place computation. Data type supported: same as @p input
+     * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output = nullptr, float epsilon = 1e-8f);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor *_input;
+    ICLTensor *_output;
+    bool       _run_in_place;
+};
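+
+/* Usage sketch (illustrative only): in-place normalization of a 2D F32 tensor, leaving the
+ * output as nullptr so the input is overwritten. CLTensor, TensorInfo and CLScheduler are
+ * assumed from the public runtime API.
+ *
+ *   CLTensor feature_rows;
+ *   feature_rows.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));
+ *   feature_rows.allocator()->allocate();
+ *
+ *   CLMeanStdDevNormalizationKernel norm;
+ *   norm.configure(&feature_rows);   // output = nullptr, epsilon = 1e-8f (defaults)
+ *   CLScheduler::get().enqueue(norm);
+ */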
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
index 4b89950..23a21d6 100644
--- a/src/core/CL/kernels/CLMedian3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
+#include "src/core/CL/kernels/CLMedian3x3Kernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.h b/src/core/CL/kernels/CLMedian3x3Kernel.h
new file mode 100644
index 0000000..8cc5ed7
--- /dev/null
+++ b/src/core/CL/kernels/CLMedian3x3Kernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMEDIAN3X3KERNEL_H
+#define ARM_COMPUTE_CLMEDIAN3X3KERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the median 3x3 filter kernel.
+ *
+ */
+class CLMedian3x3Kernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input            An input tensor. Data types supported: U8
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            An input tensor. Data types supported: U8
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
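+
+/* Usage sketch (illustrative only): a 3x3 median filter over a U8 image. When the border
+ * mode is not UNDEFINED, a border-filling kernel is normally configured to run before this
+ * one; that step is omitted here. CLTensor and CLScheduler are assumed from the public
+ * runtime API.
+ *
+ *   CLTensor src, dst;   // both U8, same shape, already allocated
+ *
+ *   CLMedian3x3Kernel median;
+ *   median.configure(&src, &dst, true);   // border_undefined = true
+ *   CLScheduler::get().enqueue(median);
+ */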
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLMEDIAN3X3KERNEL_H */
diff --git a/src/core/CL/kernels/CLMemsetKernel.cpp b/src/core/CL/kernels/CLMemsetKernel.cpp
index 186ed2a..2543b07 100644
--- a/src/core/CL/kernels/CLMemsetKernel.cpp
+++ b/src/core/CL/kernels/CLMemsetKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+#include "src/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/helpers/WindowHelpers.h"
diff --git a/src/core/CL/kernels/CLMemsetKernel.h b/src/core/CL/kernels/CLMemsetKernel.h
new file mode 100644
index 0000000..dc103f5
--- /dev/null
+++ b/src/core/CL/kernels/CLMemsetKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMEMSETKERNEL_H
+#define ARM_COMPUTE_CLMEMSETKERNEL_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for filling the planes of a tensor */
+class CLMemsetKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLMemsetKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMemsetKernel(const CLMemsetKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMemsetKernel &operator=(const CLMemsetKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMemsetKernel(CLMemsetKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMemsetKernel &operator=(CLMemsetKernel &&) = default;
+    /** Default destructor */
+    ~CLMemsetKernel() = default;
+
+    /** Initialise the kernel's tensor and filling value
+     *
+     * @param[in,out] tensor         Input tensor to fill. Supported data types: All.
+     * @param[in]     constant_value The value used to fill the planes of the tensor
+     * @param[in]     window         Window to be used in case setting only part of a tensor. Default is nullptr.
+     */
+    void configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
+    /** Initialise the kernel's tensor and filling value
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] tensor          Input tensor to fill. Supported data types: All.
+     * @param[in]     constant_value  The value used to fill the planes of the tensor
+     * @param[in]     window          Window to be used in case setting only part of a tensor. Default is nullptr.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLMemsetKernel
+     *
+     * @param[in] tensor         Source tensor info. Data types supported: All.
+     * @param[in] constant_value The value used to fill the planes of the tensor
+     * @param[in] window         Window to be used in case setting only part of a tensor. Default is nullptr.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor *_tensor;
+    Window     _full_window;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLMEMSETKERNEL_H */
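For reference, a minimal usage sketch of CLMemsetKernel (illustrative only: the tensor shape, the wrapper function and the scheduler setup are assumptions, not part of this patch):

    // Fill a whole U8 tensor with zeros; pass a Window to configure() to fill only a sub-region.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLMemsetKernel.h"

    using namespace arm_compute;

    void fill_with_zeros()
    {
        CLScheduler::get().default_init();

        CLTensor tensor;
        tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
        tensor.allocator()->allocate();

        CLMemsetKernel memset_kernel;
        memset_kernel.configure(&tensor, PixelValue()); // default-constructed PixelValue() is zero
        CLScheduler::get().enqueue(memset_kernel);
        CLScheduler::get().sync();
    }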
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
index bf645f8..7017efa 100644
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
+#include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.h b/src/core/CL/kernels/CLMinMaxLayerKernel.h
new file mode 100644
index 0000000..aa2ff3f
--- /dev/null
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
+#define ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to perform min max search on a 3D tensor.
+ */
+class CLMinMaxLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLMinMaxLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxLayerKernel(const CLMinMaxLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxLayerKernel &operator=(const CLMinMaxLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMinMaxLayerKernel(CLMinMaxLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMinMaxLayerKernel &operator=(CLMinMaxLayerKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: F32.
+     * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
+     *                    The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: F32.
+     * @param[out] output          Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
+     *                             The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel
+     *
+     * @param[in] input  Input tensor info.  Data types supported: F32.
+     * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
+     *                   The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    /** Resets global minimum and maximum
+     *
+     * @param[in,out] queue Command queue on which to map and unmap the min_max tensor
+     */
+    void reset(cl::CommandQueue &queue);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLMINMAXLAYERKERNEL_H */
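A usage sketch for CLMinMaxLayerKernel, assuming an illustrative [W, H, C, batches] F32 input and the documented [2, batches] output shape:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLMinMaxLayerKernel.h"

    using namespace arm_compute;

    void per_batch_min_max()
    {
        CLScheduler::get().default_init();

        CLTensor input, min_max;
        input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 2U), 1, DataType::F32)); // 2 batches
        min_max.allocator()->init(TensorInfo(TensorShape(2U, 2U), 1, DataType::F32));       // [2, batches]

        ARM_COMPUTE_ERROR_THROW_ON(CLMinMaxLayerKernel::validate(input.info(), min_max.info()));

        CLMinMaxLayerKernel kernel;
        kernel.configure(&input, &min_max);

        input.allocator()->allocate();
        min_max.allocator()->allocate();
        kernel.reset(CLScheduler::get().queue()); // initialise the global min/max before each run
        CLScheduler::get().enqueue(kernel);
    }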
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
index 634b580..675cfc1 100644
--- a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
+#include "src/core/CL/kernels/CLMinMaxLocationKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.h b/src/core/CL/kernels/CLMinMaxLocationKernel.h
new file mode 100644
index 0000000..2196abe
--- /dev/null
+++ b/src/core/CL/kernels/CLMinMaxLocationKernel.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H
+#define ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "src/core/CL/ICLKernel.h"
+
+#include <array>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the kernel to perform min max search on an image.
+ */
+class CLMinMaxKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLMinMaxKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxKernel(const CLMinMaxKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxKernel &operator=(const CLMinMaxKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMinMaxKernel(CLMinMaxKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMinMaxKernel &operator=(CLMinMaxKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input   Input Image. Data types supported: U8/S16/F32.
+     * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
+     */
+    void configure(const ICLImage *input, cl::Buffer *min_max);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input Image. Data types supported: U8/S16/F32.
+     * @param[out] min_max         Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;               /**< Input image. */
+    cl::Buffer      *_min_max;             /**< Minimum/maximum value. */
+    std::array<int, 2> _data_type_max_min; /**< Maximum and minimum data type value respectively. */
+};
+
+/** Interface for the kernel to find min max locations of an image.
+ */
+class CLMinMaxLocationKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLMinMaxLocationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxLocationKernel(const CLMinMaxLocationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxLocationKernel &operator=(const CLMinMaxLocationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMinMaxLocationKernel(CLMinMaxLocationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMinMaxLocationKernel &operator=(CLMinMaxLocationKernel &&) = default;
+    /** Initialise the kernel's input and outputs.
+     *
+     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
+     *
+     * @param[in]  input         Input image. Data types supported: U8/S16/F32.
+     * @param[out] min_max       Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
+     * @param[out] min_max_count Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
+     * @param[out] min_loc       (Optional) Array of Coordinates2D used to store minimum value locations.
+     * @param[out] max_loc       (Optional) Array of Coordinates2D used to store maximum value locations.
+     */
+    void configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
+                   ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
+    /** Initialise the kernel's input and outputs.
+     *
+     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input image. Data types supported: U8/S16/F32.
+     * @param[out] min_max         Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
+     * @param[out] min_max_count   Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
+     * @param[out] min_loc         (Optional) Array of Coordinates2D used to store minimum value locations.
+     * @param[out] max_loc         (Optional) Array of Coordinates2D used to store maximum value locations.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
+                   ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLImage *_input;         /**< Input image. */
+    cl::Buffer     *_min_max_count; /**< Minimum/maximum value occurrences. */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H */
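The two kernels above are intended to be chained; a sketch under the assumption of a U8 image that is already configured and allocated (the location arrays are omitted, so only values and counts are produced):

    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/core/CL/kernels/CLMinMaxLocationKernel.h"

    using namespace arm_compute;

    void min_max_and_counts(ICLImage &image) // U8 image, already allocated
    {
        cl::Context &ctx = CLScheduler::get().context();
        cl::Buffer min_max(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(cl_int));
        cl::Buffer min_max_count(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(cl_int));

        CLMinMaxKernel         min_max_kernel;
        CLMinMaxLocationKernel min_max_loc_kernel;
        min_max_kernel.configure(&image, &min_max);
        min_max_loc_kernel.configure(&image, &min_max, &min_max_count); // min_loc/max_loc default to nullptr

        CLScheduler::get().enqueue(min_max_kernel, false);
        CLScheduler::get().enqueue(min_max_loc_kernel);
    }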
diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
index 0a8472b..c73acaf 100644
--- a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
+++ b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
+#include "src/core/CL/kernels/CLNonLinearFilterKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.h b/src/core/CL/kernels/CLNonLinearFilterKernel.h
new file mode 100644
index 0000000..ed42063
--- /dev/null
+++ b/src/core/CL/kernels/CLNonLinearFilterKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H
+#define ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to apply a non-linear filter */
+class CLNonLinearFilterKernel : public ICLSimple2DKernel
+{
+public:
+    /** Default constructor */
+    CLNonLinearFilterKernel();
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8
+     * @param[out] output           Destination tensor. Data types supported: U8
+     * @param[in]  function         Non linear function to perform
+     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
+     * @param[in]  pattern          Mask pattern
+     * @param[in]  mask             The given mask. Only used when @p pattern is set to MatrixPattern::OTHER
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
+                   unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+                   bool border_undefined);
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8
+     * @param[out] output           Destination tensor. Data types supported: U8
+     * @param[in]  function         Non linear function to perform
+     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
+     * @param[in]  pattern          Mask pattern
+     * @param[in]  mask             The given mask. Only used when @p pattern is set to MatrixPattern::OTHER
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
+                   unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+                   bool border_undefined);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+
+private:
+    BorderSize _border_size; /**< Border size */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H */
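A sketch of a 5x5 median filter configuration (tensor management and border filling are left out; the box mask contents are an assumption, since the mask is only read for MatrixPattern::OTHER):

    #include <array>
    #include "src/core/CL/kernels/CLNonLinearFilterKernel.h"

    using namespace arm_compute;

    void median_5x5(const ICLTensor *src, ICLTensor *dst) // both U8, same shape
    {
        std::array<uint8_t, 25> mask;
        mask.fill(255);

        CLNonLinearFilterKernel filter;
        filter.configure(src, dst, NonLinearFilterFunction::MEDIAN, 5 /* mask_size */,
                         MatrixPattern::BOX, mask.data(), false /* border_undefined */);
    }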
diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
index 9c6d44b..7d5c5ba 100644
--- a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
new file mode 100644
index 0000000..d9ed60c
--- /dev/null
+++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H
+#define ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface to perform Non-Maxima suppression over a 3x3 window using OpenCL
+ *
+ * @note Used by @ref CLFastCorners and @ref CLHarrisCorners
+ */
+class CLNonMaximaSuppression3x3Kernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor)
+     * @param[out] output           Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor)
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor)
+     * @param[out] output           Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor)
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H */
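A sketch of the configure call, assuming pre-allocated tensors of matching type (U8 or F32) and shape:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"

    using namespace arm_compute;

    void suppress_non_maxima(const ICLTensor *src, ICLTensor *dst)
    {
        CLNonMaximaSuppression3x3Kernel nms;
        nms.configure(src, dst, true /* border_undefined */);
        CLScheduler::get().enqueue(nms);
    }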
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 686e6f1..d1982e7 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
+#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h
new file mode 100644
index 0000000..739a2ae
--- /dev/null
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class CLNormalizationLayerKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLNormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLNormalizationLayerKernel(const CLNormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLNormalizationLayerKernel &operator=(const CLNormalizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    CLNormalizationLayerKernel(CLNormalizationLayerKernel &&) = default;
+    /** Default move assignment operator */
+    CLNormalizationLayerKernel &operator=(CLNormalizationLayerKernel &&) = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+     *                       and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+     * @param[out] output    Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
+     *                       Data layouts supported: same as @p input.
+     * @param[in]  norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+     *                             and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+     * @param[out] output          Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
+     *                             Data layouts supported: same as @p input.
+     * @param[in]  norm_info       Normalization layer information like the normalization type, normalization size and other parameters.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
+     *
+     * @param[in] input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+     *                      and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+     * @param[in] output    Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
+     *                      Data layouts supported: same as @p input.
+     * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    BorderSize       _border_size;
+    bool             _is_norm_across_width;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H */
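A sketch of cross-map normalization over 5 neighbouring channels; the F32 NCHW shape and the NormalizationLayerInfo parameters are illustrative:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLNormalizationLayerKernel.h"

    using namespace arm_compute;

    void lrn_cross_map()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(14U, 14U, 32U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(14U, 14U, 32U), 1, DataType::F32));

        const NormalizationLayerInfo norm_info(NormType::CROSS_MAP, 5);
        ARM_COMPUTE_ERROR_THROW_ON(CLNormalizationLayerKernel::validate(src.info(), dst.info(), norm_info));

        CLNormalizationLayerKernel norm;
        norm.configure(&src, &dst, norm_info);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        CLScheduler::get().enqueue(norm);
    }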
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index 407ce66..18cbe21 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
+#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
new file mode 100644
index 0000000..6db4433
--- /dev/null
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H
+#define ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the NormalizePlanarYUV layer kernel. */
+class CLNormalizePlanarYUVLayerKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLNormalizePlanarYUVLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLNormalizePlanarYUVLayerKernel(const CLNormalizePlanarYUVLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLNormalizePlanarYUVLayerKernel &operator=(const CLNormalizePlanarYUVLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    CLNormalizePlanarYUVLayerKernel(CLNormalizePlanarYUVLayerKernel &&) = default;
+    /** Default move assignment operator */
+    CLNormalizePlanarYUVLayerKernel &operator=(CLNormalizePlanarYUVLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLNormalizePlanarYUVLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
+     *                    Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] output Destination tensor. Data type supported: same as @p input
+     * @param[in]  mean   Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
+     * @param[in]  std    Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
+     *                    Data types supported: same as @p input
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
+     *                             Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] output          Destination tensor. Data type supported: same as @p input
+     * @param[in]  mean            Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
+     * @param[in]  std             Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
+     *                             Data types supported: same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel
+     *
+     * @param[in]  input  Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
+     *                    Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] output Destination tensor info. Data type supported: same as @p input
+     * @param[in]  mean   Mean values tensor info. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
+     * @param[in]  std    Standard deviation values tensor info. 1 dimension with size equal to the number of input channels.
+     *                    Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    const ICLTensor *_mean;
+    const ICLTensor *_std;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H */
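A sketch of the validate/configure pattern for this kernel, assuming F32 tensors where mean and stddev are 1D with one element per input channel:

    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/core/Error.h"
    #include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"

    using namespace arm_compute;

    void normalize_planar_yuv(const ICLTensor *src, ICLTensor *dst, const ICLTensor *mean, const ICLTensor *stddev)
    {
        ARM_COMPUTE_ERROR_THROW_ON(
            CLNormalizePlanarYUVLayerKernel::validate(src->info(), dst->info(), mean->info(), stddev->info()));

        CLNormalizePlanarYUVLayerKernel kernel;
        kernel.configure(src, dst, mean, stddev);
    }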
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index 4572973..4856766 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+#include "src/core/CL/kernels/CLPadLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h
new file mode 100644
index 0000000..2b0abb1
--- /dev/null
+++ b/src/core/CL/kernels/CLPadLayerKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPADLAYERKERNEL_H
+#define ARM_COMPUTE_CLPADLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the PadLayer function. */
+class CLPadLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLPadLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPadLayerKernel(const CLPadLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPadLayerKernel &operator=(const CLPadLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLPadLayerKernel(CLPadLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLPadLayerKernel &operator=(CLPadLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLPadLayerKernel() = default;
+    /** Set the input and output tensor.
+     *
+     * @param[in]  input          Source tensor. Data types supported: All.
+     * @param[out] output         Output tensor. Data type supported: same as @p input
+     * @param[in]  padding        The padding for each spatial dimension of the input tensor. The pair padding[i]
+     *                            specifies the front and the end padding in the i-th dimension.
+     * @param[in]  constant_value (Optional) Constant value to be used for the padding.
+     * @param[in]  mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
+     *                            or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+    /** Set the input and output tensor.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: All.
+     * @param[out] output          Output tensor. Data type supported: same as @p input
+     * @param[in]  padding         The padding for each spatial dimension of the input tensor. The pair padding[i]
+     *                             specifies the front and the end padding in the i-th dimension.
+     * @param[in]  constant_value  (Optional) Constant value to be used for the padding.
+     * @param[in]  mode            (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
+     *                             or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
+                   PaddingMode mode = PaddingMode::CONSTANT);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
+     *
+     * @param[in] input          Source tensor info. Data types supported: All.
+     * @param[in] output         Output tensor info. Data type supported: same as @p input
+     * @param[in] padding        The padding for each spatial dimension of the input tensor. The pair padding[i]
+     *                           specifies the front and the end padding in the i-th dimension.
+     * @param[in] constant_value (Optional) Constant value to be used for the padding.
+     * @param[in] mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
+     *                            or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    int              _input_start_x;
+    int              _input_start_y;
+    bool             _4d_enabled;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPADLAYERKERNEL_H */
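A sketch of constant padding: one element on each side of the width and two on each side of the height (the shapes and the wrapper function are illustrative):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLPadLayerKernel.h"

    using namespace arm_compute;

    void pad_constant()
    {
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(10U, 12U), 1, DataType::F32)); // (8+1+1) x (8+2+2)

        const PaddingList padding = { { 1U, 1U }, { 2U, 2U } }; // (before, after) per dimension
        ARM_COMPUTE_ERROR_THROW_ON(CLPadLayerKernel::validate(src.info(), dst.info(), padding));

        CLPadLayerKernel pad;
        pad.configure(&src, &dst, padding, PixelValue(), PaddingMode::CONSTANT);
    }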
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index 6206657..4d289f2 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
+#include "src/core/CL/kernels/CLPermuteKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/helpers/AutoConfiguration.h"
diff --git a/src/core/CL/kernels/CLPermuteKernel.h b/src/core/CL/kernels/CLPermuteKernel.h
new file mode 100644
index 0000000..d1bb875
--- /dev/null
+++ b/src/core/CL/kernels/CLPermuteKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPERMUTEKERNEL_H
+#define ARM_COMPUTE_CLPERMUTEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform tensor permutation.
+ *
+ * Permutes the input tensor according to a given permutation vector
+ */
+class CLPermuteKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLPermuteKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPermuteKernel(const CLPermuteKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPermuteKernel &operator=(const CLPermuteKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLPermuteKernel(CLPermuteKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLPermuteKernel &operator=(CLPermuteKernel &&) = default;
+    /** Set the input and output of the kernel.
+     *
+     * @note Arbitrary permutation vectors are supported with rank not greater than 4
+     *
+     * @param[in] input  The input tensor to permute. Data types supported: All.
+     * @param[in] output The output tensor. Data types supported: Same as @p input
+     * @param[in] perm   Permutation vector
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
+    /** Set the input and output of the kernel.
+     *
+     * @note Arbitrary permutation vectors are supported with rank not greater than 4
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] input           The input tensor to permute. Data types supported: All.
+     * @param[in] output          The output tensor. Data types supported: Same as @p input
+     * @param[in] perm            Permutation vector
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLPermuteKernel
+     *
+     * @note Arbitrary permutation vectors are supported with rank not greater than 4
+     *
+     * @param[in] input  First tensor input info. Data types supported: All.
+     * @param[in] output Output tensor info. Data types supported: same as @p input.
+     * @param[in] perm   Permutation vector
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor  *_input;
+    ICLTensor        *_output;
+    PermutationVector _perm;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPERMUTEKERNEL_H */
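A sketch of an NCHW to NHWC permutation; the shapes follow the library's [W, H, C, N] / [C, W, H, N] dimension ordering and are illustrative:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLPermuteKernel.h"

    using namespace arm_compute;

    void permute_nchw_to_nhwc()
    {
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U, 2U), 1, DataType::F32)); // NCHW: [W, H, C, N]
        dst.allocator()->init(TensorInfo(TensorShape(3U, 16U, 16U, 2U), 1, DataType::F32)); // NHWC: [C, W, H, N]

        const PermutationVector perm(2U, 0U, 1U); // bring the channel dimension to the front
        ARM_COMPUTE_ERROR_THROW_ON(CLPermuteKernel::validate(src.info(), dst.info(), perm));

        CLPermuteKernel permute;
        permute.configure(&src, &dst, perm);
    }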
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index a7bd4da..a6255f8 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+#include "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
new file mode 100644
index 0000000..0cc4005
--- /dev/null
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H
+#define ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the pixelwise multiplication kernel. */
+class CLPixelWiseMultiplicationKernel : public ICLKernel
+{
+public:
+    /** Default constructor.*/
+    CLPixelWiseMultiplicationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPixelWiseMultiplicationKernel(const CLPixelWiseMultiplicationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPixelWiseMultiplicationKernel &operator=(const CLPixelWiseMultiplicationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLPixelWiseMultiplicationKernel(CLPixelWiseMultiplicationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     * @param[in]  input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in]  input2          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[out] output          The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in]  scale           Scale to apply after multiplication.
+     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in]  overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+     * @param[in]  rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
+                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in]  input2          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[out] output          The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in]  scale           Scale to apply after multiplication.
+     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in]  overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+     * @param[in]  rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
+                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in] input2          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in] output          The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in] scale           Scale to apply after multiplication.
+     *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+     * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
+                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensorInfo *_input1;
+    const ITensorInfo *_input2;
+    ITensorInfo       *_output;
+};
+
+/** Interface for the complex pixelwise multiplication kernel. */
+class CLComplexPixelWiseMultiplicationKernel : public ICLKernel
+{
+public:
+    /** Default constructor.*/
+    CLComplexPixelWiseMultiplicationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLComplexPixelWiseMultiplicationKernel(const CLComplexPixelWiseMultiplicationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLComplexPixelWiseMultiplicationKernel &operator=(const CLComplexPixelWiseMultiplicationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLComplexPixelWiseMultiplicationKernel(CLComplexPixelWiseMultiplicationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLComplexPixelWiseMultiplicationKernel &operator=(CLComplexPixelWiseMultiplicationKernel &&) = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input1   An input tensor info. Data types supported: F32. Number of channels supported: 2.
+     * @param[in]  input2   An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[out] output   The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in]  act_info (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          An input tensor info. Data types supported: F32. Number of channels supported: 2.
+     * @param[in]  input2          An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[out] output          The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplicationKernel
+     *
+     * @param[in] input1   An input tensor info. Data types supported: F32. Number of channels supported: 2.
+     * @param[in] input2   An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in] output   The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensorInfo *_input1;
+    const ITensorInfo *_input2;
+    ITensorInfo       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H */
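
The two kernels above are operator-style: configure() takes ITensorInfo pointers and execution goes through run_op() with an ITensorPack, so the simplest illustration of the contract is the static validate() entry point. A minimal sketch, assuming the internal header path added by this patch and purely illustrative F32 shapes; the scale of 1.0f satisfies the "1/255 or 1/2^n" constraint with n = 0.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    // Illustrative 2D shapes; (F32,F32) -> F32 is one of the valid configurations listed above.
    const TensorInfo input1(TensorShape(8U, 8U), 1, DataType::F32);
    const TensorInfo input2(TensorShape(8U, 8U), 1, DataType::F32);
    const TensorInfo output(TensorShape(8U, 8U), 1, DataType::F32);

    // validate() checks the configuration without creating any OpenCL objects.
    const Status status = CLPixelWiseMultiplicationKernel::validate(&input1, &input2, &output, 1.0f,
                                                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
    std::cout << (bool(status) ? "valid" : status.error_description()) << std::endl;
    return 0;
}
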
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 0570887..905610c 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+#include "src/core/CL/kernels/CLPoolingLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
@@ -34,6 +33,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
+#include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -176,7 +176,7 @@
         case DataLayout::NHWC:
         {
             // Initialize border size
-            border_size = BorderSize();
+            border_size                       = BorderSize();
             num_elems_processed_per_iteration = adjust_vec_size(4, output->dimension(0));
             win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
 
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.h b/src/core/CL/kernels/CLPoolingLayerKernel.h
new file mode 100644
index 0000000..d88402a
--- /dev/null
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H
+#define ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the pooling layer kernel */
+class CLPoolingLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLPoolingLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPoolingLayerKernel(const CLPoolingLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPoolingLayerKernel &operator=(const CLPoolingLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLPoolingLayerKernel(CLPoolingLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLPoolingLayerKernel &operator=(CLPoolingLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLPoolingLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out] indices   (Optional) The indices of the maximal values. Data type supported: U32.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] output          Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info       Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out] indices         (Optional) The indices of the maximal values. Data type supported: U32.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayerKernel
+     *
+     * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[in] indices   (Optional) The indices of the maximal values. Data type supported: U32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+public:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    ICLTensor       *_indices;
+    PoolingLayerInfo _pool_info;
+    DataLayout       _data_layout;
+    BorderSize       _border_size;
+    unsigned int     _num_elems_processed_per_iteration;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H */
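
A minimal validate() sketch for the pooling kernel, assuming an illustrative 4x4x3 NCHW F32 input and 2x2 max pooling with stride 2, which halves the spatial dimensions; the PoolingLayerInfo constructor taking a DataLayout is assumed from arm_compute/core/Types.h.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLPoolingLayerKernel.h"

using namespace arm_compute;

int main()
{
    // Illustrative NCHW shapes: 4x4 spatial, 3 channels; 2x2 max pooling with stride 2 gives a 2x2x3 output.
    const TensorInfo src(TensorShape(4U, 4U, 3U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(2U, 2U, 3U), 1, DataType::F32);

    const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NCHW, PadStrideInfo(2, 2, 0, 0));

    // validate() checks the configuration without touching any OpenCL state.
    const Status status = CLPoolingLayerKernel::validate(&src, &dst, pool_info);
    return bool(status) ? 0 : 1;
}
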
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index 202e9fb..7b9caf0 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
+#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
new file mode 100644
index 0000000..6c369a7
--- /dev/null
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H
+#define ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the PriorBox layer kernel. */
+class CLPriorBoxLayerKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLPriorBoxLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPriorBoxLayerKernel(const CLPriorBoxLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPriorBoxLayerKernel &operator=(const CLPriorBoxLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    CLPriorBoxLayerKernel(CLPriorBoxLayerKernel &&) = default;
+    /** Default move assignment operator */
+    CLPriorBoxLayerKernel &operator=(CLPriorBoxLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLPriorBoxLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input1        First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
+     * @param[in]  input2        Second source tensor. Data types and layouts supported: same as @p input1
+     * @param[out] output        Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
+     * @param[in]  info          Prior box layer info.
+     * @param[in]  min           Minimum prior box values
+     * @param[in]  max           Maximum prior box values
+     * @param[in]  aspect_ratios Aspect ratio values
+     */
+    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
+     * @param[in]  input2          Second source tensor. Data types and layouts supported: same as @p input1
+     * @param[out] output          Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
+     * @param[in]  info            Prior box layer info.
+     * @param[in]  min             Minimum prior box values
+     * @param[in]  max             Maximum prior box values
+     * @param[in]  aspect_ratios   Aspect ratio values
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max,
+                   cl::Buffer *aspect_ratios);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel
+     *
+     * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
+     * @param[in] input2 Second source tensor info. Data types and layouts supported: same as @p input1
+     * @param[in] output Destination tensor info. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input1
+     * @param[in] info   Prior box layer info.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input1;
+    const ICLTensor *_input2;
+    ICLTensor        *_output;
+    PriorBoxLayerInfo _info;
+    int               _num_priors;
+    cl::Buffer       *_min;
+    cl::Buffer       *_max;
+    cl::Buffer       *_aspect_ratios;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H */
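
A hedged sketch of the prior box validate() call. The PriorBoxLayerInfo constructor arguments (min_sizes, variances, offset, flip, clip, max_sizes, aspect_ratios) and the use of an empty output info are assumptions for illustration only, so the status is printed rather than asserted.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    // Illustrative NCHW shapes: input1 is the feature map, input2 the full image it was derived from.
    const TensorInfo feature_map(TensorShape(16U, 16U, 256U), 1, DataType::F32);
    const TensorInfo image(TensorShape(300U, 300U, 3U), 1, DataType::F32);

    // Assumed PriorBoxLayerInfo constructor order: (min_sizes, variances, offset, flip, clip, max_sizes, aspect_ratios).
    const PriorBoxLayerInfo info({ 30.f }, { 0.1f, 0.1f, 0.2f, 0.2f }, 0.5f, true, false, { 60.f }, { 2.f });

    // An uninitialised output info (total size 0) is passed on the assumption that output shape checks are skipped.
    TensorInfo output;
    const Status status = CLPriorBoxLayerKernel::validate(&feature_map, &image, &output, info);
    std::cout << (bool(status) ? "valid" : status.error_description()) << std::endl;
    return 0;
}
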
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
index ff6cc86..3a66d08 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "src/core/helpers/AutoConfiguration.h"
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
new file mode 100644
index 0000000..31085c3
--- /dev/null
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLQLSTMLAYERNORMALIZATIONKERNEL_H
+#define ARM_COMPUTE_CLQLSTMLAYERNORMALIZATIONKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to do layer normalization. */
+class CLQLSTMLayerNormalizationKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLQLSTMLayerNormalizationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLQLSTMLayerNormalizationKernel(const CLQLSTMLayerNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLQLSTMLayerNormalizationKernel &operator=(const CLQLSTMLayerNormalizationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLQLSTMLayerNormalizationKernel(CLQLSTMLayerNormalizationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLQLSTMLayerNormalizationKernel &operator=(CLQLSTMLayerNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~CLQLSTMLayerNormalizationKernel() = default;
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input  Source tensor with 2 dimensions. Data types supported: QSYMM16.
+     * @param[out] output Destination tensor. Data type supported: same as @p input
+     * @param[in]  weight Weight tensor. Data types supported: Same as @p input.
+     * @param[in]  bias   Bias tensor. Data types supported: S32.
+     *
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor with 2 dimensions. Data types supported: QSYMM16.
+     * @param[out] output          Destination tensor. Data type supported: same as @p input
+     * @param[in]  weight          Weight tensor. Data types supported: Same as @p input.
+     * @param[in]  bias            Bias tensor. Data types supported: S32.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel
+     *
+     * @param[in] input  Source tensor info with 2 dimensions. Data types supported: QSYMM16.
+     * @param[in] output Destination tensor info. Data type supported: same as @p input
+     * @param[in] weight Weight tensor info. Data types supported: Same as @p input.
+     * @param[in] bias   Bias tensor info. Data types supported: S32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    const ICLTensor *_weight;
+    const ICLTensor *_bias;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLQLSTMLAYERNORMALIZATIONKERNEL_H */
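
A minimal validate() sketch for the QLSTM layer normalization kernel. The QSYMM16 scale of 1/32768 and the shapes (8 cells, batch of 2, with a 1-D weight and bias of matching length) are illustrative assumptions; the library may additionally constrain the output quantization, so the status is printed rather than asserted.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    // Illustrative 2D input: 8 cells per batch, batch size 2.
    const TensorInfo input(TensorShape(8U, 2U), 1, DataType::QSYMM16, QuantizationInfo(1.f / 32768.f));
    const TensorInfo output(TensorShape(8U, 2U), 1, DataType::QSYMM16, QuantizationInfo(1.f / 32768.f));
    const TensorInfo weight(TensorShape(8U), 1, DataType::QSYMM16, QuantizationInfo(1.f / 32768.f));
    const TensorInfo bias(TensorShape(8U), 1, DataType::S32);

    const Status status = CLQLSTMLayerNormalizationKernel::validate(&input, &output, &weight, &bias);
    std::cout << (bool(status) ? "valid" : status.error_description()) << std::endl;
    return 0;
}
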
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
index 44889b9..76e703f 100644
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
+#include "src/core/CL/kernels/CLQuantizationLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.h b/src/core/CL/kernels/CLQuantizationLayerKernel.h
new file mode 100644
index 0000000..e9d03de
--- /dev/null
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the quantization layer kernel.
+ *
+ * @note The implementation supports only 3D input tensors.
+ */
+class CLQuantizationLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLQuantizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLQuantizationLayerKernel(const CLQuantizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLQuantizationLayerKernel &operator=(const CLQuantizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    CLQuantizationLayerKernel(CLQuantizationLayerKernel &&) = default;
+    /** Default move assignment operator */
+    CLQuantizationLayerKernel &operator=(CLQuantizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLQuantizationLayerKernel() = default;
+    /** Set the input, output.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[out] output Destination tensor with the same dimensions as @p input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this kernel
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+    /** Set the input, output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[out] output          Destination tensor with the same dimensions as @p input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this kernel
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLQuantizationLayerKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[in] output Destination tensor info with the same dimensions as @p input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H */
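
Because output auto initialization is not supported, the destination info has to be fully set up before validation, including its quantization info. A minimal sketch with an assumed quantization of scale 0.5 and offset 10:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLQuantizationLayerKernel.h"

using namespace arm_compute;

int main()
{
    // Illustrative shapes: the destination must have the same dimensions as the source.
    const TensorInfo src(TensorShape(16U, 16U, 3U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(16U, 16U, 3U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));

    const Status status = CLQuantizationLayerKernel::validate(&src, &dst);
    return bool(status) ? 0 : 1;
}
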
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index ca6c6fa..38eafc6 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
+#include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h
new file mode 100644
index 0000000..cbf0e00
--- /dev/null
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
+#define ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the RoIAlign kernel.
+ */
+class CLROIAlignLayerKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLROIAlignLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLROIAlignLayerKernel(const CLROIAlignLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLROIAlignLayerKernel &operator=(const CLROIAlignLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    CLROIAlignLayerKernel(CLROIAlignLayerKernel &&) = default;
+    /** Default move assignment operator. */
+    CLROIAlignLayerKernel &operator=(CLROIAlignLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLROIAlignLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  rois      ROIs tensor, a 2D tensor of size [5, N] (where N is the number of ROIs) containing the top-left and bottom-right corner
+     *                       coordinates of each ROI together with its batch index, in the form [ batch_id, x1, y1, x2, y2 ].
+     *                       Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  rois            ROIs tensor, a 2D tensor of size [5, N] (where N is the number of ROIs) containing the top-left and bottom-right corner
+     *                             coordinates of each ROI together with its batch index, in the form [ batch_id, x1, y1, x2, y2 ].
+     *                             Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input
+     * @param[out] output          Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info       Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel
+     *
+     * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] rois      ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED,
+     *                      otherwise same as @p input
+     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
+     * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+     *
+     * @return a Status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor    *_input;
+    ICLTensor          *_output;
+    const ICLTensor    *_rois;
+    ROIPoolingLayerInfo _pool_info;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H*/
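
A minimal validate() sketch for ROI align, assuming a 16x16x3 F32 feature map, 4 ROIs stored as a [5, N] tensor and a 7x7 pooled output whose z dimension matches the input depth and whose fourth dimension matches N, as the notes above require. The spatial scale is illustrative.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLROIAlignLayerKernel.h"

using namespace arm_compute;

int main()
{
    // Illustrative NCHW shapes: feature map, 4 ROIs, and the corresponding pooled output.
    const TensorInfo input(TensorShape(16U, 16U, 3U), 1, DataType::F32);
    const TensorInfo rois(TensorShape(5U, 4U), 1, DataType::F32);
    TensorInfo       output(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32);

    const ROIPoolingLayerInfo pool_info(7U, 7U, /* spatial_scale */ 0.125f);
    const Status              status = CLROIAlignLayerKernel::validate(&input, &rois, &output, pool_info);
    return bool(status) ? 0 : 1;
}
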
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index 55fe5a5..43492a3 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
+#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
new file mode 100644
index 0000000..35f42a9
--- /dev/null
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H
+#define ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/CL/ICLArray.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the ROI pooling layer kernel */
+class CLROIPoolingLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLROIPoolingLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLROIPoolingLayerKernel(const CLROIPoolingLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLROIPoolingLayerKernel &operator=(const CLROIPoolingLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLROIPoolingLayerKernel(CLROIPoolingLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLROIPoolingLayerKernel &operator=(CLROIPoolingLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLROIPoolingLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. Data types supported: F16/F32.
+     * @param[in]  rois      ROIs tensor, a 2D tensor of size [5, N] (where N is the number of ROIs) containing the top-left and bottom-right corner
+     *                       coordinates of each ROI together with its batch index, in the form [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16.
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: F16/F32.
+     * @param[in]  rois            ROIs tensor, a 2D tensor of size [5, N] (where N is the number of ROIs) containing the top-left and bottom-right corner
+     *                             coordinates of each ROI together with its batch index, in the form [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16.
+     * @param[out] output          Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info       Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor    *_input;
+    const ICLTensor    *_rois;
+    ICLTensor          *_output;
+    ROIPoolingLayerInfo _pool_info;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H */
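
This kernel declares no validate(), so the sketch below shows the configure-then-enqueue flow instead. It assumes the usual CLScheduler/CLTensor runtime setup; the shapes and the spatial scale are illustrative, and the tensors are left unfilled purely to show the call sequence.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Illustrative shapes: a 16x16x3 F32 feature map, 4 ROIs as a [5, N] U16 tensor, 7x7 pooled outputs.
    CLTensor src, rois, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));
    rois.allocator()->init(TensorInfo(TensorShape(5U, 4U), 1, DataType::U16));
    dst.allocator()->init(TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32));

    CLROIPoolingLayerKernel roi_pool;
    roi_pool.configure(&src, &rois, &dst, ROIPoolingLayerInfo(7U, 7U, /* spatial_scale */ 0.125f));

    src.allocator()->allocate();
    rois.allocator()->allocate();
    dst.allocator()->allocate();

    // In real code the source and ROI tensors would be filled before enqueueing the kernel.
    CLScheduler::get().enqueue(roi_pool);
    CLScheduler::get().sync();
    return 0;
}
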
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
index a4c30b6..892f1c7 100644
--- a/src/core/CL/kernels/CLRangeKernel.cpp
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
+#include "src/core/CL/kernels/CLRangeKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h
new file mode 100644
index 0000000..1b94a09
--- /dev/null
+++ b/src/core/CL/kernels/CLRangeKernel.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLRANGEKERNEL_H
+#define ARM_COMPUTE_CLRANGEKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Kernel class for Range
+ *
+ * Range generates a 1-D tensor containing a sequence of numbers that begins at 'start' and extends by increments
+ * of 'step' up to, but not including, 'end'.
+ */
+class CLRangeKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLRangeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLRangeKernel(const CLRangeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLRangeKernel &operator=(const CLRangeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLRangeKernel(CLRangeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLRangeKernel &operator=(CLRangeKernel &&) = default;
+    /** Default destructor */
+    ~CLRangeKernel() = default;
+    /** Initialize the kernel's output tensor, start, end and step of the sequence.
+     *
+     * @param[out] output Output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  start  The starting value of the sequence.
+     * @param[in]  end    The ending value of the sequence (exclusive).
+     * @param[in]  step   The gap between each pair of values in the sequence.
+     */
+    void configure(ICLTensor *output, float start, float end, float step);
+    /** Initialize the kernel's output tensor, start, end and step of the sequence.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[out] output          Output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  start           The starting value of the sequence.
+     * @param[in]  end             The ending value of the sequence (exclusive).
+     * @param[in]  step            The gap between each pair of values in the sequence.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLRangeKernel
+     *
+     * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+     * @param[in] start  The starting value of the sequence.
+     * @param[in] end    The ending value of the sequence (exclusive).
+     * @param[in] step   The gap between each pair of values in the sequence.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *output, float start, float end, float step);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    float      _start;  /**< Start of sequence */
+    float      _end;    /**< End of sequence */
+    float      _step;   /**< Increment/step value */
+    ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLRANGEKERNEL_H */
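
A minimal end-to-end sketch for the range kernel: since it only writes an output tensor, it can be validated, configured and enqueued without any input data. The output length of 10 matches ceil((end - start) / step) for the illustrative values below; the CLScheduler/CLTensor runtime setup is assumed.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/core/CL/kernels/CLRangeKernel.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // [0, 10) with step 1 produces 10 values, so the output is a 1-D tensor of 10 F32 elements.
    CLTensor out;
    out.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

    if(bool(CLRangeKernel::validate(out.info(), 0.f, 10.f, 1.f)))
    {
        CLRangeKernel range;
        range.configure(&out, /* start */ 0.f, /* end */ 10.f, /* step */ 1.f);

        out.allocator()->allocate();
        CLScheduler::get().enqueue(range);
        CLScheduler::get().sync();
    }
    return 0;
}
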
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 325e4b9..9d49a21 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "src/core/CL/kernels/CLReductionOperationKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h
new file mode 100644
index 0000000..ff9fd61
--- /dev/null
+++ b/src/core/CL/kernels/CLReductionOperationKernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
+#define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reduction operation kernel
+ */
+class CLReductionOperationKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLReductionOperationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLReductionOperationKernel(const CLReductionOperationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLReductionOperationKernel &operator=(const CLReductionOperationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLReductionOperationKernel(CLReductionOperationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLReductionOperationKernel &operator=(CLReductionOperationKernel &&) = default;
+    /** Default destructor */
+    ~CLReductionOperationKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+     * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+     *                    Output will have the same number of dimensions as input.
+     * @param[in]  axis   Axis along which to reduce. Supported reduction axes: 0, 1, 2, 3
+     * @param[in]  op     Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
+     * @param[in]  width  (Optional) In case of reduction along the x-axis, the width of the input image must also be provided.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+     * @param[out] output          Destination tensor. Data types and data layouts supported: Same as @p input.
+     *                             Output will have the same number of dimensions as input.
+     * @param[in]  axis            Axis along which to reduce. Supported reduction axes: 0, 1, 2, 3
+     * @param[in]  op              Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
+     * @param[in]  width           (Optional) In case of reduction along the x-axis, the width of the input image must also be provided.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
+     *
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+     * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
+     *                   Output will have the same number of dimensions as input.
+     * @param[in] axis   Axis along which to reduce. Supported reduction axes: 0, 1, 2, 3
+     * @param[in] op     Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
+     * @param[in] width  (Optional) In case of reduction along the x-axis, the width of the input image must also be provided.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor   *_input;
+    ICLTensor         *_output;
+    unsigned int       _reduction_axis;
+    ReductionOperation _op;
+    BorderSize         _border_size;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H */
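
A minimal validate() sketch for a sum reduction along the x-axis: an illustrative 8x4 F32 input reduces to 1x4, keeping the same number of dimensions as documented above.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"

using namespace arm_compute;

int main()
{
    // Reducing axis 0 collapses the first dimension to 1 while keeping the tensor rank.
    const TensorInfo src(TensorShape(8U, 4U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(1U, 4U), 1, DataType::F32);

    const Status status = CLReductionOperationKernel::validate(&src, &dst, /* axis */ 0, ReductionOperation::SUM);
    return bool(status) ? 0 : 1;
}
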
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
index 8d3f41b..0ebeefc 100644
--- a/src/core/CL/kernels/CLRemapKernel.cpp
+++ b/src/core/CL/kernels/CLRemapKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
+#include "src/core/CL/kernels/CLRemapKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLRemapKernel.h b/src/core/CL/kernels/CLRemapKernel.h
new file mode 100644
index 0000000..8efcf09
--- /dev/null
+++ b/src/core/CL/kernels/CLRemapKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLREMAPKERNEL_H
+#define ARM_COMPUTE_CLREMAPKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a remap on a tensor */
+class CLRemapKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLRemapKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLRemapKernel(const CLRemapKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLRemapKernel &operator=(const CLRemapKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLRemapKernel(CLRemapKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLRemapKernel &operator=(CLRemapKernel &&) = default;
+    /** Initialize the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[in]  map_x            Map for X coordinates. Data types supported: F32.
+     * @param[in]  map_y            Map for Y coordinates. Data types supported: F32.
+     * @param[out] output           Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
+     * @param[in]  policy           The interpolation type.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
+    /** Initialize the kernel's input, output and border mode.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[in]  map_x            Map for X coordinates. Data types supported: F32.
+     * @param[in]  map_y            Map for Y coordinates. Data types supported: F32.
+     * @param[out] output           Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
+     * @param[in]  policy           The interpolation type.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    const ICLTensor *_map_x;
+    const ICLTensor *_map_y;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLREMAPKERNEL_H */
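
No validate() is declared for the remap kernel, so the sketch below shows the configure call with the documented data types (U8 image, F32 coordinate maps). The 64x64 shapes and the interpolation policy are illustrative; filling the border region is normally handled by a separate border-handler kernel at the function level.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/core/CL/kernels/CLRemapKernel.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Illustrative 64x64 U8 image remapped through per-pixel F32 coordinate maps of the same size.
    CLTensor src, map_x, map_y, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    map_x.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
    map_y.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

    CLRemapKernel remap;
    remap.configure(&src, &map_x, &map_y, &dst, InterpolationPolicy::NEAREST_NEIGHBOR, /* border_undefined */ true);

    src.allocator()->allocate();
    map_x.allocator()->allocate();
    map_y.allocator()->allocate();
    dst.allocator()->allocate();

    // In real code the source and both coordinate maps would be filled before enqueueing the kernel.
    CLScheduler::get().enqueue(remap);
    CLScheduler::get().sync();
    return 0;
}
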
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
index ade7761..662c790 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h"
+#include "src/core/CL/kernels/CLReorgLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h
new file mode 100644
index 0000000..455a617
--- /dev/null
+++ b/src/core/CL/kernels/CLReorgLayerKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLREORGLAYERKERNEL_H
+#define ARM_COMPUTE_CLREORGLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a reorg layer */
+class CLReorgLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLReorgLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLReorgLayerKernel(const CLReorgLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLReorgLayerKernel &operator=(const CLReorgLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLReorgLayerKernel(CLReorgLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLReorgLayerKernel &operator=(CLReorgLayerKernel &&) = default;
+    /** Initialize the kernel's input and output.
+     *
+     * @param[in]  input  Source tensor. Data types supported: All.
+     * @param[out] output Destination tensor with tensor shape:
+     *                    [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
+     *                    the same number of input elements. Data types supported: same as @p input.
+     * @param[in]  stride Stride value to use for reorganizing the values in the output tensor.
+     *                    It defines the spatial distance between 2 consecutive pixels in the x and y direction
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, int32_t stride);
+    /** Initialize the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: All.
+     * @param[out] output          Destination tensor with tensor shape:
+     *                             [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
+     *                             the same number of input elements. Data types supported: same as @p input.
+     * @param[in]  stride          Stride value to use for reorganizing the values in the output tensor.
+     *                             It defines the spatial distance between 2 consecutive pixels in the x and y direction
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLReorgLayerKernel
+     *
+     * @param[in] input  Source tensor. Data types supported: All.
+     * @param[in] output Destination tensor with tensor shape:
+     *                   [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
+     *                   the same number of input elements. Data types supported: same as @p input.
+     * @param[in] stride Stride value to use for reorganizing the values in the output tensor
+     *                   It defines the spatial distance between 2 consecutive pixels in the x and y direction
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLREORGLAYERKERNEL_H */
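The shape rule documented above keeps the element count constant: width and height shrink by stride while the channel count grows by stride * stride. A small validation sketch with made-up shapes, assuming the standard TensorInfo/TensorShape/Status types from the public API:

// 16x16x4 input with stride 2 -> 8x8x16 output (same total number of elements).
const int32_t    stride = 2;
const TensorInfo src_info(TensorShape(16U, 16U, 4U), 1, DataType::F32);
const TensorInfo dst_info(TensorShape(8U, 8U, 16U), 1, DataType::F32);

const Status status = CLReorgLayerKernel::validate(&src_info, &dst_info, stride);
// status.error_code() == ErrorCode::OK means the shape/stride combination is accepted
// before any OpenCL buffer is allocated or any kernel is configured.
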
diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.cpp b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
index b14013b..58d7843 100644
--- a/src/core/CL/kernels/CLReshapeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+#include "src/core/CL/kernels/CLReshapeLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.h b/src/core/CL/kernels/CLReshapeLayerKernel.h
new file mode 100644
index 0000000..902c446
--- /dev/null
+++ b/src/core/CL/kernels/CLReshapeLayerKernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLRESHAPELAYERKERNEL_H
+#define ARM_COMPUTE_CLRESHAPELAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to perform tensor reshaping */
+class CLReshapeLayerKernel : public ICLKernel
+{
+public:
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor info. Data type supported: All.
+     * @param[out] output          Destination tensor info. Data type supported: Same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLReshapeLayerKernel
+     *
+     * @param[in] input  Source tensor info. Data type supported: All
+     * @param[in] output Destination tensor info. Data type supported: Same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLRESHAPELAYERKERNEL_H */
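Unlike most kernels in this patch, this one keeps no tensor pointers: it is configured from ITensorInfo metadata and the actual tensors are bound per run through run_op(). A rough sketch of that flow, assuming the experimental ITensorPack / CLScheduler::enqueue_op() API available in this release and an already-obtained CLCompileContext; treat the exact dispatch helper as an assumption:

CLReshapeLayerKernel reshape;
reshape.configure(compile_context, src.info(), dst.info()); // src/dst are allocated CLTensor objects

ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, &src);
pack.add_tensor(TensorType::ACL_DST, &dst);
CLScheduler::get().enqueue_op(reshape, pack); // assumed helper that calls run_op() with this pack
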
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
index f824098..9a87625 100644
--- a/src/core/CL/kernels/CLReverseKernel.cpp
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
+#include "src/core/CL/kernels/CLReverseKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h
new file mode 100644
index 0000000..4a21e4f
--- /dev/null
+++ b/src/core/CL/kernels/CLReverseKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLREVERSEKERNEL_H
+#define ARM_COMPUTE_CLREVERSEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reverse kernel */
+class CLReverseKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLReverseKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLReverseKernel(const CLReverseKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLReverseKernel &operator=(const CLReverseKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLReverseKernel(CLReverseKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLReverseKernel &operator=(CLReverseKernel &&) = default;
+    /** Default destructor */
+    ~CLReverseKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input  Input tensor. Data types supported: All.
+     * @param[out] output Output tensor. Data type supported: Same as @p input
+     * @param[in]  axis   Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data types supported: All.
+     * @param[out] output          Output tensor. Data type supported: Same as @p input
+     * @param[in]  axis            Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: All.
+     * @param[in] output Output tensor info. Data type supported: Same as @p input
+     * @param[in] axis   Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+public:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    const ICLTensor *_axis;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLREVERSEKERNEL_H */
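Note that axis is itself a small 1-D U32 tensor whose elements name the dimensions to flip, not a plain integer. A hedged sketch with illustrative shapes, assuming the standard CLTensor/CLScheduler setup:

// Reverse dimensions 0 and 2 of a 3-D tensor.
CLTensor src, dst, axis;
src.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F16));
dst.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F16));
axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); // holds the two axis indices

CLReverseKernel reverse;
reverse.configure(&src, &dst, &axis);
// After allocating the tensors and filling src and axis (e.g. with {0, 2}):
CLScheduler::get().enqueue(reverse);
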
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 8233f21..5a7d583 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+#include "src/core/CL/kernels/CLScaleKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
@@ -33,6 +32,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
+#include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLScaleKernel.h b/src/core/CL/kernels/CLScaleKernel.h
new file mode 100644
index 0000000..a72e393
--- /dev/null
+++ b/src/core/CL/kernels/CLScaleKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSCALEKERNEL_H
+#define ARM_COMPUTE_CLSCALEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the scale kernel */
+class CLScaleKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's inputs, output and interpolation policy
+     *
+     * @param[in]  input  Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
+     * @param[out] output Destination tensor. Data types supported: Same as @p input
+     *                    All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  info   @ref ScaleKernelInfo Kernel descriptor to be used to configure.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
+    /** Initialise the kernel's inputs, output and interpolation policy
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
+     * @param[out] output          Destination tensor. Data types supported: Same as @p input
+     *                             All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  info            @ref ScaleKernelInfo Kernel descriptor to be used to configure.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLScaleKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
+     * @param[in] output Destination tensor info. Data types supported: Same as @p input
+     *                   All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in] info   @ref ScaleKernelInfo Kernel descriptor to be used to validate
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info);
+    /** Input tensor accessor.
+     *
+     * @return Pointer to input tensor.
+     */
+    const ICLTensor *input() const;
+    /** Output tensor accessor.
+     *
+     * @return Pointer to output tensor.
+     */
+    const ICLTensor *output() const;
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+    // Getter for interpolation policy
+    InterpolationPolicy get_interpolation_policy() const
+    {
+        return _interpolation_policy;
+    }
+
+private:
+    InterpolationPolicy _interpolation_policy = InterpolationPolicy::BILINEAR;
+    DataLayout          _data_layout          = DataLayout::UNKNOWN;
+    bool                _align_corners        = false;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLSCALEKERNEL_H */
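All scaling options travel in the ScaleKernelInfo descriptor rather than as individual arguments. A minimal sketch with illustrative shapes, assuming ScaleKernelInfo can be brace-initialised from an interpolation policy and a border mode (the authoritative field list lives in arm_compute/core/KernelDescriptors.h):

// Upscale a U8 image from 32x32 to 64x64.
CLTensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::U8));
dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));

const ScaleKernelInfo info{ InterpolationPolicy::BILINEAR, BorderMode::REPLICATE }; // assumed constructor arguments

CLScaleKernel scale;
if(CLScaleKernel::validate(src.info(), dst.info(), info).error_code() == ErrorCode::OK)
{
    scale.configure(&src, &dst, info);
}
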
diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
index 1e33af3..7ceddc9 100644
--- a/src/core/CL/kernels/CLScharr3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
+#include "src/core/CL/kernels/CLScharr3x3Kernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.h b/src/core/CL/kernels/CLScharr3x3Kernel.h
new file mode 100644
index 0000000..a670da5
--- /dev/null
+++ b/src/core/CL/kernels/CLScharr3x3Kernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSCHARR3X3KERNEL_H
+#define ARM_COMPUTE_CLSCHARR3X3KERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run a 3x3 Scharr filter on a tensor.
+ *
+ * @f[
+ *      \mathbf{G}_x=\begin{vmatrix}
+ *      -3 & 0 & +3\\
+ *      -10& 0 & +10\\
+ *      -3 & 0 & +3
+ *      \end{vmatrix}
+ * @f]
+ * @f[
+ *      \mathbf{G}_y=\begin{vmatrix}
+ *      -3 & -10 & -3\\
+ *       0 & 0 & 0\\
+ *      +3 & +10 & +3
+ *      \end{vmatrix}
+ * @f]
+ */
+class CLScharr3x3Kernel : public ICLKernel
+{
+public:
+    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+    CLScharr3x3Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLScharr3x3Kernel(const CLScharr3x3Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLScharr3x3Kernel &operator=(const CLScharr3x3Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLScharr3x3Kernel(CLScharr3x3Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLScharr3x3Kernel &operator=(CLScharr3x3Kernel &&) = default;
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    bool             _run_scharr_x; /**< Do we need to run Scharr X ? */
+    bool             _run_scharr_y; /**< Do we need to run Scharr Y ? */
+    const ICLTensor *_input;        /**< Input image */
+    ICLTensor       *_output_x;     /**< Output image for scharr X */
+    ICLTensor       *_output_y;     /**< Output image for scharr Y */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLSCHARR3X3KERNEL_H */
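Both gradient outputs are optional but at least one must be requested; passing nullptr for the unused one skips that half of the filter. A sketch computing only the Gx response, with illustrative shapes (the 3x3 Sobel kernel below is driven the same way):

CLTensor src, grad_x;
src.allocator()->init(TensorInfo(TensorShape(128U, 96U), 1, DataType::U8));
grad_x.allocator()->init(TensorInfo(TensorShape(128U, 96U), 1, DataType::S16));

CLScharr3x3Kernel scharr;
scharr.configure(&src, &grad_x, nullptr /* output_y not needed */, false /* border_undefined */);
CLScheduler::get().enqueue(scharr);
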
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
index d9a1044..53e5414 100644
--- a/src/core/CL/kernels/CLSelectKernel.cpp
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
+#include "src/core/CL/kernels/CLSelectKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h
new file mode 100644
index 0000000..93ae27f
--- /dev/null
+++ b/src/core/CL/kernels/CLSelectKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSELECTKERNEL_H
+#define ARM_COMPUTE_CLSELECTKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** OpenCL interface for executing the select kernel
+ *
+ * Select is computed by:
+ * @f[ output(i) = condition(i) ? x(i) : y(i) @f]
+ **/
+class CLSelectKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLSelectKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSelectKernel(const CLSelectKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSelectKernel &operator=(const CLSelectKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLSelectKernel(CLSelectKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLSelectKernel &operator=(CLSelectKernel &&) = default;
+    /** Default destructor */
+    ~CLSelectKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  c      Condition input tensor. Data types supported: U8.
+     * @param[in]  x      First input tensor. Data types supported: All.
+     * @param[in]  y      Second input tensor. Data types supported: Same as @p x
+     * @param[out] output Output tensor. Data types supported: Same as @p x.
+     */
+    void configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  c               Condition input tensor. Data types supported: U8.
+     * @param[in]  x               First input tensor. Data types supported: All.
+     * @param[in]  y               Second input tensor. Data types supported: Same as @p x
+     * @param[out] output          Output tensor. Data types supported: Same as @p x.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel
+     *
+     * @param[in] c      Condition input tensor. Data types supported: U8.
+     * @param[in] x      First input tensor. Data types supported: All.
+     * @param[in] y      Second input tensor. Data types supported: Same as @p x
+     * @param[in] output Output tensor. Data types supported: Same as @p x.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_c;             /**< Condition tensor */
+    const ICLTensor *_x;             /**< Source tensor 1 */
+    const ICLTensor *_y;             /**< Source tensor 2 */
+    ICLTensor       *_output;        /**< Destination tensor */
+    bool             _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLSELECTKERNEL_H */
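A sketch of the element-wise select defined by the formula above, with illustrative shapes and the usual CLTensor/CLScheduler setup assumed; the U8 condition tensor acts as a per-element boolean mask and, as the _has_same_rank flag hints, may also have a lower rank than x and y:

CLTensor cond, x, y, out;
cond.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
x.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
y.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
out.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

CLSelectKernel select;
select.configure(&cond, &x, &y, &out); // out(i) = cond(i) ? x(i) : y(i)
CLScheduler::get().enqueue(select);
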
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
index 89e5207..a87677a 100644
--- a/src/core/CL/kernels/CLSobel3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
+#include "src/core/CL/kernels/CLSobel3x3Kernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.h b/src/core/CL/kernels/CLSobel3x3Kernel.h
new file mode 100644
index 0000000..fed8068
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel3x3Kernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSOBEL3X3KERNEL_H
+#define ARM_COMPUTE_CLSOBEL3X3KERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run a 3x3 Sobel filter on a tensor. */
+class CLSobel3x3Kernel : public ICLKernel
+{
+public:
+    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+    CLSobel3x3Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSobel3x3Kernel(const CLSobel3x3Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSobel3x3Kernel &operator=(const CLSobel3x3Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLSobel3x3Kernel(CLSobel3x3Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLSobel3x3Kernel &operator=(CLSobel3x3Kernel &&) = default;
+    /** Default destructor */
+    ~CLSobel3x3Kernel() = default;
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input;       /**< Input tensor */
+    ICLTensor       *_output_x;    /**< Output tensor for Sobel X */
+    ICLTensor       *_output_y;    /**< Output tensor for Sobel Y */
+    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
+    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLSOBEL3X3KERNEL_H */
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
index 3e765e4..c450bec 100644
--- a/src/core/CL/kernels/CLSobel5x5Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.h b/src/core/CL/kernels/CLSobel5x5Kernel.h
new file mode 100644
index 0000000..a163ac9
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel5x5Kernel.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSOBEL5X5KERNEL_H
+#define ARM_COMPUTE_CLSOBEL5X5KERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run the horizontal pass of 5x5 Sobel filter on a tensor. */
+class CLSobel5x5HorKernel : public ICLKernel
+{
+public:
+    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+    CLSobel5x5HorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSobel5x5HorKernel(const CLSobel5x5HorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSobel5x5HorKernel &operator=(const CLSobel5x5HorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLSobel5x5HorKernel(CLSobel5x5HorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLSobel5x5HorKernel &operator=(CLSobel5x5HorKernel &&) = default;
+    /** Default destructor */
+    ~CLSobel5x5HorKernel() = default;
+
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input;       /**< Input tensor */
+    ICLTensor       *_output_x;    /**< X output of horizontal pass */
+    ICLTensor       *_output_y;    /**< Y output of horizontal pass */
+    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
+    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
+    BorderSize       _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of 5x5 Sobel filter on a tensor. */
+class CLSobel5x5VertKernel : public ICLKernel
+{
+public:
+    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+    CLSobel5x5VertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSobel5x5VertKernel(const CLSobel5x5VertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSobel5x5VertKernel &operator=(const CLSobel5x5VertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLSobel5x5VertKernel(CLSobel5x5VertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLSobel5x5VertKernel &operator=(CLSobel5x5VertKernel &&) = default;
+    /** Default destructor */
+    ~CLSobel5x5VertKernel() = default;
+
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set and the corresponding input.
+     *
+     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S16.
+     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set and the corresponding input.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S16.
+     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input_x;     /**< X input (X output of the horizontal pass) */
+    const ICLTensor *_input_y;     /**< Y input (Y output of the horizontal pass) */
+    ICLTensor       *_output_x;    /**< X output of sobel */
+    ICLTensor       *_output_y;    /**< Y output of sobel */
+    bool             _run_sobel_x; /**< Do we need to run sobel X? */
+    bool             _run_sobel_y; /**< Do we need to run sobel Y? */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLSOBEL5X5KERNEL_H */
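The 5x5 filter is separable, so it is split into a horizontal and a vertical kernel with S16 intermediates in between, and both passes must agree on which gradients are being produced (the 7x7 pair below mirrors this layout with S32 data). A rough wiring sketch, with tensor setup elided:

CLTensor src, mid_x, mid_y, grad_x, grad_y;
// ... init src as U8 and the other four tensors as S16 with the same 2-D shape, then allocate ...

CLSobel5x5HorKernel  hor;
CLSobel5x5VertKernel vert;
hor.configure(&src, &mid_x, &mid_y, false /* border_undefined */);
vert.configure(&mid_x, &mid_y, &grad_x, &grad_y, false /* border_undefined */);

CLScheduler::get().enqueue(hor, false); // queue both passes, flush once at the end
CLScheduler::get().enqueue(vert);
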
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
index 37ceaba..1cfa74f 100644
--- a/src/core/CL/kernels/CLSobel7x7Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
+#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.h b/src/core/CL/kernels/CLSobel7x7Kernel.h
new file mode 100644
index 0000000..c85f0ae
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel7x7Kernel.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSOBEL7X7KERNEL_H
+#define ARM_COMPUTE_CLSOBEL7X7KERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run the horizontal pass of 7x7 Sobel filter on a tensor. */
+class CLSobel7x7HorKernel : public ICLKernel
+{
+public:
+    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+    CLSobel7x7HorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSobel7x7HorKernel(const CLSobel7x7HorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSobel7x7HorKernel &operator=(const CLSobel7x7HorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLSobel7x7HorKernel(CLSobel7x7HorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLSobel7x7HorKernel &operator=(CLSobel7x7HorKernel &&) = default;
+    /** Default destructor */
+    ~CLSobel7x7HorKernel() = default;
+
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input;       /**< Input tensor */
+    ICLTensor       *_output_x;    /**< X output of horizontal pass */
+    ICLTensor       *_output_y;    /**< Y output of horizontal pass */
+    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
+    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
+    BorderSize       _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of 7x7 Sobel filter on a tensor. */
+class CLSobel7x7VertKernel : public ICLKernel
+{
+public:
+    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+    CLSobel7x7VertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSobel7x7VertKernel(const CLSobel7x7VertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSobel7x7VertKernel &operator=(const CLSobel7x7VertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLSobel7x7VertKernel(CLSobel7x7VertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLSobel7x7VertKernel &operator=(CLSobel7x7VertKernel &&) = default;
+    /** Default destructor */
+    ~CLSobel7x7VertKernel() = default;
+
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set and the corresponding input.
+     *
+     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S32.
+     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @note At least one of output_x or output_y must be set and the corresponding input.
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S32.
+     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input_x;     /**< X input (X output of the horizontal pass) */
+    const ICLTensor *_input_y;     /**< Y input (Y output of the horizontal pass) */
+    ICLTensor       *_output_x;    /**< X output of sobel */
+    ICLTensor       *_output_y;    /**< Y output of sobel */
+    bool             _run_sobel_x; /**< Do we need to run sobel X? */
+    bool             _run_sobel_y; /**< Do we need to run sobel Y? */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLSOBEL7X7KERNEL_H */
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 5c0acda..d9f498c 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+#include "src/core/CL/kernels/CLSoftmaxLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.h b/src/core/CL/kernels/CLSoftmaxLayerKernel.h
new file mode 100644
index 0000000..29e0f63
--- /dev/null
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
+#define ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLSimple3DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for max, shifting, exponentiating and summing the logits */
+class CLLogits1DMaxShiftExpSumKernel : public ICLKernel
+{
+public:
+    /** Info for whether a parallel reduction will be run and the vector size of the execution. */
+    using ParallelReductionInfo = std::tuple<bool, unsigned int>;
+
+public:
+    /** Default constructor */
+    CLLogits1DMaxShiftExpSumKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogits1DMaxShiftExpSumKernel(const CLLogits1DMaxShiftExpSumKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogits1DMaxShiftExpSumKernel &operator=(const CLLogits1DMaxShiftExpSumKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLLogits1DMaxShiftExpSumKernel(CLLogits1DMaxShiftExpSumKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLLogits1DMaxShiftExpSumKernel &operator=(CLLogits1DMaxShiftExpSumKernel &&) = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]     input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in,out] max    Max values tensor. Data types supported: same as @p input
+     * @param[out]    output Destination tensor. Data types supported: same as @p input
+     * @param[out]    sum    Sum of 1D logits tensor. Data types supported: same as @p input
+     * @param[in]     info   Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
+     */
+    void configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info);
+    /** Set the input and output tensors.
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in]     input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in,out] max             Max values tensor. Data types supported: same as @p input
+     * @param[out]    output          Destination tensor. Data types supported: same as @p input
+     * @param[out]    sum             Sum of 1D logits tensor. Data types supported: same as @p input
+     * @param[in]     info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxShiftExpSumKernel
+     *
+     * @param[in] input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in] max    Max values tensor. Data types supported: same as @p input
+     * @param[in] output Destination tensor. Data types supported: same as @p input
+     * @param[in] sum    Sum of 1D logits tensor. Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum);
+    /** Checks if the given size is eligible for parallel reduction
+     *
+     * @note  Serial reduction is launched for width < (_grid_size * _serial_vector_size).
+     * @note  Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4.
+     *
+     * @param[in] size Size to check
+     *
+     * @return A two-element tuple where the first element is a boolean specifying if a parallel reduction will be run,
+     *         while the second element is the vector size of the execution.
+     */
+    static ParallelReductionInfo is_parallel_reduction(size_t size);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_max;
+    ICLTensor       *_output;
+    ICLTensor       *_sum;
+
+private:
+    static const unsigned int _grid_size;
+    static const unsigned int _serial_vector_size;
+    static const unsigned int _parallel_vector_size;
+};
+/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
+class CLLogits1DNormKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLLogits1DNormKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogits1DNormKernel(const CLLogits1DNormKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogits1DNormKernel &operator=(const CLLogits1DNormKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLLogits1DNormKernel(CLLogits1DNormKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported.
+     * @param[in]  sum    Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
+     * @param[out] output Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
+     * @param[in]  info   Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported.
+     * @param[in]  sum             Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
+     * @param[out] output          Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
+     * @param[in]  info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DNormKernel
+     *
+     * @param[in] input  Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported.
+     * @param[in] sum    Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
+     * @param[in] output Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
+     * @param[in] info   Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    const ICLTensor *_sum;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H */
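A minimal usage sketch of the two softmax kernels above (illustrative only, not part of the diff hunks). It assumes an OpenCL-capable device, and that src, max, tmp, sum and dst are CLTensor objects whose allocators have been initialised and allocated with compatible shapes; the include paths reflect this patch's new layout.

    // Hedged sketch: chaining CLLogits1DMaxShiftExpSumKernel and CLLogits1DNormKernel.
    // Assumes: #include "arm_compute/runtime/CL/CLScheduler.h"
    //          #include "arm_compute/runtime/CL/CLTensor.h"
    //          #include "src/core/CL/kernels/CLSoftmaxLayerKernel.h"
    using namespace arm_compute;

    CLScheduler::get().default_init(); // pick the default OpenCL device and context

    SoftmaxKernelInfo info{};
    info.beta            = 1.f;
    info.input_data_type = src.info()->data_type();

    CLLogits1DMaxShiftExpSumKernel max_shift_exp_sum;
    CLLogits1DNormKernel           norm;
    max_shift_exp_sum.configure(&src, &max, &tmp, &sum, info); // per-row max, shifted exp and sum
    norm.configure(&tmp, &sum, &dst, info);                    // divide by the per-row sum

    // The reduction strategy depends on the row width and can be queried up front.
    auto reduction = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(src.info()->dimension(0));
    ARM_COMPUTE_UNUSED(reduction);

    CLScheduler::get().enqueue(max_shift_exp_sum, false);
    CLScheduler::get().enqueue(norm, true);

Later sketches in this section reuse the same setup (initialised scheduler, pre-allocated src/dst tensors) and only show the kernel-specific calls.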
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index c6f70c3..91b889a 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
+#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
new file mode 100644
index 0000000..4819c80
--- /dev/null
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
+#define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the space to batch kernel */
+class CLSpaceToBatchLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLSpaceToBatchLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSpaceToBatchLayerKernel(const CLSpaceToBatchLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSpaceToBatchLayerKernel &operator=(const CLSpaceToBatchLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLSpaceToBatchLayerKernel(CLSpaceToBatchLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLSpaceToBatchLayerKernel &operator=(CLSpaceToBatchLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLSpaceToBatchLayerKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
+     * @param[in]  paddings    2-D tensor with shape [2, M]. Data types supported: S32
+     * @param[out] output      Tensor output. Data types supported: same as @p input
+     */
+    void configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape     1-D tensor with shape [M]. Data types supported: S32
+     * @param[in]  paddings        2-D tensor with shape [2, M]. Data types supported: S32
+     * @param[out] output          Tensor output. Data types supported: same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+    /** Initialise the kernel's input and output. (Static block shape and paddings)
+     *
+     * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape_x Block shape x value.
+     * @param[in]  block_shape_y Block shape y value.
+     * @param[in]  padding_left  The left padding of the output tensor.
+     * @param[in]  padding_right The right padding of the output tensor.
+     * @param[out] output        Tensor output. Data types supported: same as @p input
+     */
+    void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
+    /** Initialise the kernel's input and output. (Static block shape and paddings)
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in]  block_shape_x   Block shape x value.
+     * @param[in]  block_shape_y   Block shape y value.
+     * @param[in]  padding_left    The left padding of the output tensor.
+     * @param[in]  padding_right   The right padding of the output tensor.
+     * @param[out] output          Tensor output. Data types supported: same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+                   ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel
+     *
+     * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
+     * @param[in] paddings    2-D tensor with shape [2, M]. Data types supported: S32
+     * @param[in] output      Tensor output. Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings)
+     *
+     * @param[in] input         Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[in] block_shape_x Block shape x value.
+     * @param[in] block_shape_y Block shape y value.
+     * @param[in] padding_left  The left padding of the output tensor.
+     * @param[in] padding_right The right padding of the output tensor.
+     * @param[in] output        Tensor output. Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;       /**< Source tensor */
+    const ICLTensor *_block_shape; /**< Block shape tensor */
+    const ICLTensor *_paddings;    /**< Paddings tensor */
+    ICLTensor       *_output;      /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H */
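A hedged sketch of the static-shape overload documented above (illustrative only, not part of the diff hunks), reusing the setup from the softmax sketch; src and dst are assumed to be allocated rank-4 tensors with shapes that match a 2x2 block and one pixel of padding on each side.

    // Hedged sketch: space-to-batch with a static 2x2 block shape.
    using namespace arm_compute;

    ARM_COMPUTE_ERROR_THROW_ON(CLSpaceToBatchLayerKernel::validate(
        src.info(), /* block_shape_x */ 2, /* block_shape_y */ 2,
        Size2D(1, 1) /* padding_left */, Size2D(1, 1) /* padding_right */, dst.info()));

    CLSpaceToBatchLayerKernel s2b;
    s2b.configure(&src, 2, 2, Size2D(1, 1), Size2D(1, 1), &dst);
    CLScheduler::get().enqueue(s2b);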
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
index 2d46aad..1c648e0 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
+#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
new file mode 100644
index 0000000..bb1ac5f
--- /dev/null
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
+#define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the space to depth kernel */
+class CLSpaceToDepthLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLSpaceToDepthLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSpaceToDepthLayerKernel(const CLSpaceToDepthLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSpaceToDepthLayerKernel &operator=(const CLSpaceToDepthLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLSpaceToDepthLayerKernel(CLSpaceToDepthLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLSpaceToDepthLayerKernel &operator=(CLSpaceToDepthLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLSpaceToDepthLayerKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[out] output      Tensor output. Data types supported: same as @p input
+     * @param[in]  block_shape Block shape value.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
+     * @param[out] output          Tensor output. Data types supported: same as @p input
+     * @param[in]  block_shape     Block shape value.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel.
+     *
+     * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
+     * @param[in] output      Tensor output info. Data types supported: same as @p input
+     * @param[in] block_shape Block shape value.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;       /**< Source tensor */
+    ICLTensor       *_output;      /**< Destination tensor */
+    int32_t          _block_shape; /**< Block shape */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H */
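The space-to-depth kernel takes a scalar block shape rather than block/padding tensors. A hedged sketch (illustrative only, not part of the diff hunks), reusing the earlier setup:

    // Hedged sketch: block shape 2 -> width/height halve, channels grow by a factor of 4.
    using namespace arm_compute;

    ARM_COMPUTE_ERROR_THROW_ON(CLSpaceToDepthLayerKernel::validate(src.info(), dst.info(), /* block_shape */ 2));

    CLSpaceToDepthLayerKernel s2d;
    s2d.configure(&src, &dst, 2);
    CLScheduler::get().enqueue(s2d);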
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index 5055065..9bdcc8d 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLStackLayerKernel.h"
+#include "src/core/CL/kernels/CLStackLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h
new file mode 100644
index 0000000..2865127
--- /dev/null
+++ b/src/core/CL/kernels/CLStackLayerKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CLSTACKLAYERKERNEL_H
+#define ARM_COMPUTE_CLSTACKLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension. */
+class CLStackLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLStackLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLStackLayerKernel(const CLStackLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLStackLayerKernel &operator=(const CLStackLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLStackLayerKernel(CLStackLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLStackLayerKernel &operator=(CLStackLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLStackLayerKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @note Supported input tensor rank: up to 4
+     *
+     * @param[in]  input       Input tensor. Data types supported: All.
+     * @param[in]  axis        The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
+     * @param[in]  idx_input   Index of the input tensor in the list of tensors to stack.
+     *                         All tensors in the list must have the same shape
+     * @param[in]  num_tensors Number of tensors to stack
+     * @param[out] output      Output tensor. Data types supported: Same as @p input.
+     *
+     */
+    void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+    /** Initialise the kernel's inputs and output
+     *
+     * @note Supported input tensor rank: up to 4
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data types supported: All.
+     * @param[in]  axis            The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
+     * @param[in]  idx_input       Index of the input tensor in the list of tensors to stack.
+     *                             All tensors in the list must have the same shape
+     * @param[in]  num_tensors     Number of tensors to stack
+     * @param[out] output          Output tensor. Data types supported: Same as @p input.
+     *
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
+     *
+     * @note Supported input tensor rank: up to 4
+     *
+     * @param[in] input       Input tensor info. Data types supported: All.
+     * @param[in] axis        The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
+     * @param[in] idx_input   Index of the input tensor in the list of tensors to stack
+     *                        All tensors in the list must have the same shape
+     * @param[in] num_tensors Number of tensors to stack
+     * @param[in] output      Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLSTACKLAYERKERNEL_H */
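Each CLStackLayerKernel instance writes a single input of the list into the stacked output at idx_input, so stacking N tensors uses N configured kernels. A hedged sketch (illustrative only, not part of the diff hunks), assuming in0..in2 are allocated tensors of identical shape and dst is sized for the stacked result:

    // Hedged sketch: stack three equally-shaped tensors along axis 0.
    using namespace arm_compute;

    std::vector<ICLTensor *>        inputs = { &in0, &in1, &in2 };
    std::vector<CLStackLayerKernel> kernels(inputs.size());
    const unsigned int              axis        = 0;
    const unsigned int              num_tensors = static_cast<unsigned int>(inputs.size());

    for(unsigned int i = 0; i < num_tensors; ++i)
    {
        kernels[i].configure(inputs[i], axis, i, num_tensors, &dst); // kernel i fills slice i
    }
    for(auto &k : kernels)
    {
        CLScheduler::get().enqueue(k, false);
    }
    CLScheduler::get().sync();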
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index b632e05..c87fcb9 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "src/core/CL/kernels/CLStridedSliceKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h
new file mode 100644
index 0000000..599cf34
--- /dev/null
+++ b/src/core/CL/kernels/CLStridedSliceKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
+#define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Interface for the kernel to perform tensor strided slicing */
+class CLStridedSliceKernel : public ICLKernel
+{
+public:
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor info. Data type supported: All.
+     * @param[out] output           Destination tensor info. Data type supported: Same as @p input
+     * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  begin_mask       If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in]  end_mask         If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+     *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
+                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                   int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in] input            Source tensor. Data type supported: All.
+     * @param[in] output           Destination tensor. Data type supported: Same as @p input
+     * @param[in] starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] begin_mask       If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] end_mask         If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+     *                             A slice of size 1 starting from starts[i] in the dimension must be preserved.
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                           int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H */
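Unlike the older kernels in this patch, CLStridedSliceKernel is configured on ITensorInfo objects and receives its tensors at run time through an ITensorPack. A hedged sketch (illustrative only, not part of the diff hunks); the coordinates are illustrative, and obtaining the compile context from CLKernelLibrary is an assumption about this library version:

    // Hedged sketch: slice [0..4) x [0..4) with stride 2 in both dimensions.
    using namespace arm_compute;

    CLStridedSliceKernel slice;
    slice.configure(CLKernelLibrary::get().get_compile_context(), src.info(), dst.info(),
                    Coordinates(0, 0), Coordinates(4, 4), BiStrides(2, 2),
                    /* begin_mask */ 0, /* end_mask */ 0, /* shrink_axis_mask */ 0);

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    CLScheduler::get().enqueue_op(slice, pack, true);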
diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp
index 3b8ca60..b82f4c9 100644
--- a/src/core/CL/kernels/CLTableLookupKernel.cpp
+++ b/src/core/CL/kernels/CLTableLookupKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
+#include "src/core/CL/kernels/CLTableLookupKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLLut.h"
diff --git a/src/core/CL/kernels/CLTableLookupKernel.h b/src/core/CL/kernels/CLTableLookupKernel.h
new file mode 100644
index 0000000..c8d15cb
--- /dev/null
+++ b/src/core/CL/kernels/CLTableLookupKernel.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLTABLELOOKUPKERNEL_H
+#define ARM_COMPUTE_CLTABLELOOKUPKERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+class ICLLut;
+
+/** Interface for the kernel to perform table lookup calculations. */
+class CLTableLookupKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input, lut and output.
+     *
+     * @param[in]  input  An input tensor. Data types supported: U8, S16.
+     * @param[in]  lut    The input LUT. Data types supported: U8, S16.
+     * @param[out] output The output tensor. Data types supported: U8, S16.
+     */
+    void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
+    /** Initialise the kernel's input, lut and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           An input tensor. Data types supported: U8, S16.
+     * @param[in]  lut             The input LUT. Data types supported: U8, S16.
+     * @param[out] output          The output tensor. Data types supported: U8, S16.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLTABLELOOKUPKERNEL_H */
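A hedged sketch of driving the table-lookup kernel with a CLLut (illustrative only, not part of the diff hunks); populating the table is elided and the U8 shapes are assumed:

    // Hedged sketch: U8 -> U8 table lookup.
    using namespace arm_compute;

    CLLut lut(256, DataType::U8);
    lut.allocator()->allocate(); // filling the 256 entries is omitted here

    CLTableLookupKernel lookup;
    lookup.configure(&src, &lut, &dst);
    CLScheduler::get().enqueue(lookup);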
diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp
index de81644..72c22f0 100644
--- a/src/core/CL/kernels/CLThresholdKernel.cpp
+++ b/src/core/CL/kernels/CLThresholdKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
+#include "src/core/CL/kernels/CLThresholdKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/src/core/CL/kernels/CLThresholdKernel.h b/src/core/CL/kernels/CLThresholdKernel.h
new file mode 100644
index 0000000..511eaed
--- /dev/null
+++ b/src/core/CL/kernels/CLThresholdKernel.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLTHRESHOLDKERNEL_H
+#define ARM_COMPUTE_CLTHRESHOLDKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the thresholding kernel. */
+class CLThresholdKernel : public ICLSimple2DKernel
+{
+public:
+    /**Initialise the kernel's input, output and threshold parameters.
+     *
+     * @param[in]  input  An input tensor. Data types supported: U8
+     * @param[out] output The output tensor. Data types supported: U8.
+     * @param[in]  info   Threshold descriptor
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
+    /**Initialise the kernel's input, output and threshold parameters.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           An input tensor. Data types supported: U8
+     * @param[out] output          The output tensor. Data types supported: U8.
+     * @param[in]  info            Threshold descriptor
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLTHRESHOLDKERNEL_H */
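A hedged sketch of the descriptor-based configure (illustrative only, not part of the diff hunks); the ThresholdKernelInfo field order shown is an assumption about the descriptor in KernelDescriptors.h:

    // Hedged sketch: binary threshold at 128 on a U8 tensor.
    using namespace arm_compute;

    ThresholdKernelInfo info(/* threshold */ 128, /* false_value */ 0, /* true_value */ 255,
                             ThresholdType::BINARY, /* upper */ 0);

    CLThresholdKernel threshold;
    threshold.configure(&src, &dst, info);
    CLScheduler::get().enqueue(threshold);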
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index 43c8953..c0c3d2e 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLTileKernel.h"
+#include "src/core/CL/kernels/CLTileKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/helpers/AutoConfiguration.h"
diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h
new file mode 100644
index 0000000..41752ca
--- /dev/null
+++ b/src/core/CL/kernels/CLTileKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLTILEKERNEL_H
+#define ARM_COMPUTE_CLTILEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a Tile operation */
+class CLTileKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLTileKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLTileKernel(const CLTileKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLTileKernel &operator=(const CLTileKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLTileKernel(CLTileKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLTileKernel &operator=(CLTileKernel &&) = default;
+    /** Default destructor */
+    ~CLTileKernel() = default;
+    /** Set the source and destination of the kernel
+     *
+     * @param[in]  input     Source tensor. Data type supported: All.
+     * @param[in]  multiples Contains the number of times the input tensor should be replicated on the given dimension.
+     *                       Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
+     * @param[out] output    Destination tensor. Same as @p input
+     *
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+    /** Set the source and destination of the kernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data type supported: All.
+     * @param[in]  multiples       Contains the number of times the input tensor should be replicated on the given dimension.
+     *                             Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
+     * @param[out] output          Destination tensor. Same as @p input
+     *
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel
+     *
+     * @param[in] input     Source tensor info. Data type supported: All.
+     * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
+     *                      Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
+     * @param[in] output    Destination tensor info. Same as @p input
+     *
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLTILEKERNEL_H */
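A hedged sketch of the tile kernel (illustrative only, not part of the diff hunks); Multiples is the library's vector-of-repeat-counts type, and dst is assumed to be allocated with the tiled shape:

    // Hedged sketch: replicate the tensor 2x along dimension 0 and 3x along dimension 1.
    using namespace arm_compute;

    const Multiples multiples = { 2, 3 };
    ARM_COMPUTE_ERROR_THROW_ON(CLTileKernel::validate(src.info(), dst.info(), multiples));

    CLTileKernel tile;
    tile.configure(&src, &dst, multiples);
    CLScheduler::get().enqueue(tile);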
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index bd91019..8d967e9 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+#include "src/core/CL/kernels/CLTransposeKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLTransposeKernel.h b/src/core/CL/kernels/CLTransposeKernel.h
new file mode 100644
index 0000000..0c4b7b4
--- /dev/null
+++ b/src/core/CL/kernels/CLTransposeKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLTRANSPOSEKERNEL_H
+#define ARM_COMPUTE_CLTRANSPOSEKERNEL_H
+
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel which transposes the elements of a matrix.
+ *
+ * [width, height, batch] -> [height, width, batch]
+ *
+ */
+class CLTransposeKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data types supported: All.
+     * @param[out] output Output tensor. Data type supported: Same as @p input
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data types supported: All.
+     * @param[out] output          Output tensor. Data type supported: Same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLTransposeKernel
+     *
+     * @param[in] input  Input tensor. Data types supported: All.
+     * @param[in] output Output tensor. Data type supported: Same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLTRANSPOSEKERNEL_H */
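A hedged sketch of the transpose kernel (illustrative only, not part of the diff hunks); dst is assumed to be allocated with the [height, width, batch] shape the kernel expects:

    // Hedged sketch: 2D transpose of each batch.
    using namespace arm_compute;

    ARM_COMPUTE_ERROR_THROW_ON(CLTransposeKernel::validate(src.info(), dst.info()));

    CLTransposeKernel transpose;
    transpose.configure(&src, &dst);
    CLScheduler::get().enqueue(transpose);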
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
index a4fc10f..acb2fbc 100644
--- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h"
+#include "src/core/CL/kernels/CLUpsampleLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.h b/src/core/CL/kernels/CLUpsampleLayerKernel.h
new file mode 100644
index 0000000..f90ee07
--- /dev/null
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLUPSAMPLELAYERKERNEL_H
+#define ARM_COMPUTE_CLUPSAMPLELAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the UpsampleLayer kernel on OpenCL. */
+class CLUpsampleLayerKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLUpsampleLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLUpsampleLayerKernel(const CLUpsampleLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLUpsampleLayerKernel &operator=(const CLUpsampleLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    CLUpsampleLayerKernel(CLUpsampleLayerKernel &&) = default;
+    /** Default move assignment operator */
+    CLUpsampleLayerKernel &operator=(CLUpsampleLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLUpsampleLayerKernel() = default;
+
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input             Source tensor. Data types supported: All.
+     * @param[out] output            Destination tensor. Data types supported: same as @p input.
+     * @param[in]  info              Contains stride information described in @ref Size2D.
+     * @param[in]  upsampling_policy Defines the policy to fill the intermediate pixels.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context   The compile context to be used.
+     * @param[in]  input             Source tensor. Data types supported: All.
+     * @param[out] output            Destination tensor. Data types supported: same as @p input.
+     * @param[in]  info              Contains stride information described in @ref Size2D.
+     * @param[in]  upsampling_policy Defines the policy to fill the intermediate pixels.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLUpsampleLayerKernel
+     *
+     * @param[in] input             Source tensor info. Data types supported: All.
+     * @param[in] output            Destination tensor info. Data types supported: same as @p input.
+     * @param[in] info              Contains stride information described in @ref Size2D.
+     * @param[in] upsampling_policy Defines the policy to fill the intermediate pixels.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy upsampling_policy);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    Size2D           _info;
+    DataLayout       _data_layout;
+    unsigned int     _num_elems_processed_per_iteration_input_x;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLUPSAMPLELAYERKERNEL_H */
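A hedged sketch of the upsample kernel (illustrative only, not part of the diff hunks); dst is assumed to be allocated at twice the spatial size of src, and nearest-neighbour is the policy typically used:

    // Hedged sketch: 2x2 nearest-neighbour upsampling.
    using namespace arm_compute;

    ARM_COMPUTE_ERROR_THROW_ON(CLUpsampleLayerKernel::validate(src.info(), dst.info(), Size2D(2, 2),
                                                               InterpolationPolicy::NEAREST_NEIGHBOR));

    CLUpsampleLayerKernel upsample;
    upsample.configure(&src, &dst, Size2D(2, 2), InterpolationPolicy::NEAREST_NEIGHBOR);
    CLScheduler::get().enqueue(upsample);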
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
index 95a7c1b..600c67a 100644
--- a/src/core/CL/kernels/CLWarpAffineKernel.cpp
+++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
+#include "src/core/CL/kernels/CLWarpAffineKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.h b/src/core/CL/kernels/CLWarpAffineKernel.h
new file mode 100644
index 0000000..c600ee7
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpAffineKernel.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLWARPAFFINEKERNEL_H
+#define ARM_COMPUTE_CLWARPAFFINEKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the warp affine kernel.*/
+class CLWarpAffineKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialize the function's source, destination, interpolation policy and border_mode.
+     *
+     * @param[in]  input  Source tensor. Data types supported: U8.
+     * @param[out] output Destination tensor. Data types supported: U8.
+     * @param[in]  matrix The affine matrix. Must be 2x3 of type float.
+     *                    The matrix argument requires 9 values; the last 3 values are ignored.
+     * @param[in]  policy The interpolation type.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
+    /** Initialize the function's source, destination, interpolation policy and border_mode.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: U8.
+     * @param[out] output          Destination tensor. Data types supported: U8.
+     * @param[in]  matrix          The affine matrix. Must be 2x3 of type float.
+     *                             The matrix argument requires 9 values; the last 3 values are ignored.
+     * @param[in]  policy          The interpolation type.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLWARPAFFINEKERNEL_H */
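A hedged sketch of the warp-affine kernel (illustrative only, not part of the diff hunks); the matrix values are placeholders, the coefficient ordering follows the library's convention, and only the first six of the nine entries are read:

    // Hedged sketch: warp a U8 image with a 2x3 affine matrix (placeholder coefficients).
    using namespace arm_compute;

    std::array<float, 9> matrix{}; // first 6 values: 2x3 affine coefficients; last 3 ignored

    CLWarpAffineKernel warp;
    warp.configure(&src, &dst, matrix, InterpolationPolicy::NEAREST_NEIGHBOR);
    CLScheduler::get().enqueue(warp);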
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
index 2fe1feb..5f20a0b 100644
--- a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
+++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
+#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.h b/src/core/CL/kernels/CLWarpPerspectiveKernel.h
new file mode 100644
index 0000000..dcbe1c5
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H
+#define ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+/** Interface for the warp perspective kernel.*/
+class CLWarpPerspectiveKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialize the function's source, destination, interpolation policy and border_mode.
+     *
+     * @param[in]  input  Source tensor. Data types supported: U8.
+     * @param[out] output Destination tensor. Data types supported: U8.
+     * @param[in]  matrix The perspective matrix. Must be 3x3 of type float.
+     * @param[in]  policy The interpolation type.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
+    /** Initialize the function's source, destination, interpolation policy and border_mode.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: U8.
+     * @param[out] output          Destination tensor, Data types supported: U8.
+     * @param[in]  matrix          The perspective matrix. Must be 3x3 of type float.
+     * @param[in]  policy          The interpolation type.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H */
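Usage mirrors the warp-affine sketch above, except that all nine entries of the 3x3 perspective matrix are read. A hedged sketch (illustrative only, not part of the diff hunks):

    // Hedged sketch: warp a U8 image with a full 3x3 perspective matrix (placeholder coefficients).
    using namespace arm_compute;

    std::array<float, 9> matrix{}; // all 9 values are used for the perspective transform

    CLWarpPerspectiveKernel warp;
    warp.configure(&src, &dst, matrix, InterpolationPolicy::BILINEAR);
    CLScheduler::get().enqueue(warp);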
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index c06c2d3..559f47c 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.h b/src/core/CL/kernels/CLWeightsReshapeKernel.h
new file mode 100644
index 0000000..402a604
--- /dev/null
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
+#define ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** OpenCL kernel to perform reshaping on the weights used by convolution and locally connected layer
+ *
+ * Rearranges each 3-dimensional kernel into a single row, producing a matrix of linearized kernels.
+ * In combination with the @ref CLIm2ColKernel, it can transform a convolution into a matrix multiplication.
+ *
+ * For example, assuming a 3D weight kernel of dimensions 3x3 and a depth of 2, we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
+ */
+class CLWeightsReshapeKernel : public ICLKernel
+{
+public:
+    /** Constructor.*/
+    CLWeightsReshapeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWeightsReshapeKernel(const CLWeightsReshapeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWeightsReshapeKernel &operator=(const CLWeightsReshapeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLWeightsReshapeKernel(CLWeightsReshapeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLWeightsReshapeKernel &operator=(CLWeightsReshapeKernel &&) = default;
+    /** Default destructor */
+    ~CLWeightsReshapeKernel() = default;
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input      The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                        and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: All
+     * @param[in]  biases     The shared biases tensor to append. Bias is a 1D tensor with dimensions [OFM] if shared and a 2D tensor with
+     *                        dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32; for quantized types this must be nullptr.
+     *                        @warning Appending biases to the reshaped weights matrix is not supported for quantized asymmetric types.
+     * @param[out] output     The output tensor. Should be a 2D tensor if there are no groups and the weights are not shared; a 3D tensor otherwise.
+     *                        Data types supported: Same as @p input
+     * @param[in]  num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout,
+     *                        and the number of weights must be a multiple of num_groups.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                             and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: All
+     * @param[in]  biases          The shared biases tensor to append. Bias is a 1D tensor with dimensions [OFM] if shared and a 2D tensor with
+     *                             dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32; for quantized types this must be nullptr.
+     *                             @warning Appending biases to the reshaped weights matrix is not supported for quantized asymmetric types.
+     * @param[out] output          The output tensor. Should be a 2D tensor if there are no groups and the weights are not shared; a 3D tensor otherwise.
+     *                             Data types supported: Same as @p input
+     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout,
+     *                             and the number of weights must be a multiple of num_groups.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel
+     *
+     * @param[in] input      The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                       and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: All
+     * @param[in] biases     The shared biases tensor to append. Bias is a 1D tensor with dimensions [OFM] if shared and a 2D tensor with
+     *                       dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32; for quantized types this must be nullptr.
+     *                       @warning Appending biases to the reshaped weights matrix is not supported for quantized asymmetric types.
+     * @param[in] output     The output tensor. Should be a 2D tensor if there are no groups and the weights are not shared; a 3D tensor otherwise.
+     *                       Data types supported: Same as @p input
+     * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout,
+     *                       and the number of weights must be a multiple of num_groups.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    const ICLTensor *_biases;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H */
\ No newline at end of file
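To make the row-linearization described in the class comment above concrete, here is a hedged, host-side C++ sketch of the equivalent rearrangement (no ACL calls; the memory layout with kernel_x fastest is an assumption for illustration, and the optional bias column is omitted):

    #include <cstddef>
    #include <vector>

    // Reshape weights laid out as [kx, ky, ifm, ofm] (kx fastest) into a matrix
    // with one linearized row of length kx*ky*ifm per output feature map,
    // mirroring what CLWeightsReshapeKernel produces on the device.
    std::vector<float> reshape_weights(const std::vector<float> &w,
                                       std::size_t kx, std::size_t ky, std::size_t ifm, std::size_t ofm)
    {
        std::vector<float> out(ofm * kx * ky * ifm);
        for(std::size_t o = 0; o < ofm; ++o)
        {
            std::size_t col = 0;
            for(std::size_t c = 0; c < ifm; ++c)
            {
                for(std::size_t y = 0; y < ky; ++y)
                {
                    for(std::size_t x = 0; x < kx; ++x)
                    {
                        // Source index assumes x fastest, then y, then ifm, then ofm.
                        out[o * (kx * ky * ifm) + col++] = w[((o * ifm + c) * ky + y) * kx + x];
                    }
                }
            }
        }
        return out;
    }

With the 3x3x2 example from the comment, each output row reads a000..a022 followed by a100..a122, matching the matrix shown above.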
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
index a7a3463..d6697ba 100644
--- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
new file mode 100644
index 0000000..2af89e1
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H
+#define ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** Interface for the width concatenate kernel of 2 tensors.
+ *  The input1 and input2 tensors will be concatenated into the output tensor.
+ */
+class CLWidthConcatenate2TensorsKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLWidthConcatenate2TensorsKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWidthConcatenate2TensorsKernel(const CLWidthConcatenate2TensorsKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWidthConcatenate2TensorsKernel &operator=(const CLWidthConcatenate2TensorsKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLWidthConcatenate2TensorsKernel(CLWidthConcatenate2TensorsKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLWidthConcatenate2TensorsKernel &operator=(CLWidthConcatenate2TensorsKernel &&) = default;
+    /** Default destructor */
+    ~CLWidthConcatenate2TensorsKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          First input tensor. Data types supported: All.
+     * @param[in]  input2          Second input tensor. Data types supported: same as @p input1
+     * @param[out] output          Output tensor. Data types supported: Same as @p input1.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+    /**  Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate2TensorsKernel
+     *
+     * @param[in] input1 First tensor info. Data types supported: All.
+     * @param[in] input2 Second tensor info. Data types supported: same as @p input1
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H */
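As a hedged illustration of the shape contract in the comment above, the static validate() can be exercised on tensor infos alone; the shapes below are made up for the example and assume dimension 0 is the width:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"

    using namespace arm_compute;

    // Two inputs concatenated along X: the output width must be the sum of the
    // input widths, with all other dimensions identical.
    bool width_concat2_ok()
    {
        const TensorInfo in1(TensorShape(32U, 16U, 8U), 1, DataType::F32);
        const TensorInfo in2(TensorShape(24U, 16U, 8U), 1, DataType::F32);
        const TensorInfo out(TensorShape(56U, 16U, 8U), 1, DataType::F32); // 32 + 24 along X
        const Status     s = CLWidthConcatenate2TensorsKernel::validate(&in1, &in2, &out);
        return s.error_code() == ErrorCode::OK;
    }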
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
index 1c8fef2..7ecdd30 100644
--- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
+#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
new file mode 100644
index 0000000..0caf871
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H
+#define ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** Interface for the width concatenate kernel of 4 tensors.
+ *  All input tensors will be concatenated into the output tensor.
+ */
+class CLWidthConcatenate4TensorsKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLWidthConcatenate4TensorsKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWidthConcatenate4TensorsKernel(const CLWidthConcatenate4TensorsKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWidthConcatenate4TensorsKernel &operator=(const CLWidthConcatenate4TensorsKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLWidthConcatenate4TensorsKernel(CLWidthConcatenate4TensorsKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLWidthConcatenate4TensorsKernel &operator=(CLWidthConcatenate4TensorsKernel &&) = default;
+    /** Default destructor */
+    ~CLWidthConcatenate4TensorsKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          First input tensor. Data types supported: All.
+     * @param[in]  input2          Second input tensor. Data types supported: same as @p input1
+     * @param[in]  input3          Third input tensor. Data types supported: same as @p input1
+     * @param[in]  input4          Fourth input tensor. Data types supported: same as @p input1
+     * @param[out] output          Output tensor. Data types supported: Same as @p input1.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output);
+    /**  Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate4TensorsKernel
+     *
+     * @param[in] input1 First tensor info. Data types supported: All.
+     * @param[in] input2 Second tensor info. Data types supported: same as @p input1
+     * @param[in] input3 Third tensor info. Data types supported: same as @p input1
+     * @param[in] input4 Fourth tensor info. Data types supported: same as @p input1
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H */
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
index a9a601d..30d0a48 100644
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
+#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.h b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.h
new file mode 100644
index 0000000..09c3f44
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H
+#define ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** Interface for the width concatenate kernel.
+ *  The input tensor will be copied into the output tensor starting at the given width offset.
+ */
+class CLWidthConcatenateLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLWidthConcatenateLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWidthConcatenateLayerKernel(const CLWidthConcatenateLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWidthConcatenateLayerKernel &operator=(const CLWidthConcatenateLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLWidthConcatenateLayerKernel(CLWidthConcatenateLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLWidthConcatenateLayerKernel &operator=(CLWidthConcatenateLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLWidthConcatenateLayerKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in]     input           Input tensor. Data types supported: All.
+     * @param[in]     width_offset    The offset on the X axis.
+     * @param[in,out] output          Output tensor. Data types supported: Same as @p input.
+     *
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int width_offset, ITensorInfo *output);
+    /**  Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenateLayerKernel
+     *
+     * @param[in] input        Input tensor info. Data types supported: All.
+     * @param[in] width_offset The offset on the X axis.
+     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H */
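The single-input variant above is typically driven with a running offset: each input is written into the shared output starting at the sum of the widths already concatenated. A hedged sketch of that bookkeeping, using only validate() and made-up shapes:

    #include <vector>
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h"

    using namespace arm_compute;

    // Validate concatenating several tensors along X, one kernel per input,
    // each placed at a cumulative width offset into the shared output.
    bool validate_width_concat(const std::vector<TensorInfo> &inputs, const TensorInfo &output)
    {
        unsigned int width_offset = 0;
        for(const auto &in : inputs)
        {
            const Status s = CLWidthConcatenateLayerKernel::validate(&in, width_offset, &output);
            if(s.error_code() != ErrorCode::OK)
            {
                return false;
            }
            width_offset += in.dimension(0); // dimension 0 assumed to be the width
        }
        return true;
    }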
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
index e2f9ca5..bd45ddb 100644
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
+#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -83,7 +83,7 @@
     const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
     const unsigned int num_elems_read_per_iteration_z      = input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2);
 
-    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z));
+    Window win           = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z));
     Window win_collapsed = win.collapse(win, Window::DimZ);
     return std::make_pair(Status{}, win_collapsed);
 }
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.h b/src/core/CL/kernels/CLWinogradFilterTransformKernel.h
new file mode 100644
index 0000000..d22fede
--- /dev/null
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H
+#define ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the Winograd filter transform kernel. */
+class CLWinogradFilterTransformKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLWinogradFilterTransformKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradFilterTransformKernel(const CLWinogradFilterTransformKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradFilterTransformKernel &operator=(const CLWinogradFilterTransformKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLWinogradFilterTransformKernel(CLWinogradFilterTransformKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLWinogradFilterTransformKernel &operator=(CLWinogradFilterTransformKernel &&) = default;
+    /** Default destructor */
+    ~CLWinogradFilterTransformKernel() = default;
+    /** Set the input and output tensor.
+     *
+     * @note Winograd filter transform supports the following configurations for NCHW data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd filter transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in]  input         Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
+     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
+     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
+    /** Set the input and output tensor.
+     *
+     * @note Winograd filter transform supports the following configurations for NCHW data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd filter transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
+     * @param[out] output          The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
+     * @param[in]  winograd_info   Contains Winograd's information described in @ref WinogradInfo
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradFilterTransformKernel
+     *
+     * @note Winograd filter transform supports the following configurations for NCHW data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd filter transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in]  input         Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
+     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
+     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H */
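A hedged sketch of checking one of the configurations listed above (F(4x4, 3x3), NHWC); the shapes are illustrative, and the use of WinogradInfo and the shape-calculator helper are assumptions about the public API rather than content of this patch:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"
    #include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h"

    using namespace arm_compute;

    bool winograd_filter_transform_ok()
    {
        // NHWC weights [IFM, kernel_x, kernel_y, OFM] = [64, 3, 3, 128], F32.
        TensorInfo weights(TensorShape(64U, 3U, 3U, 128U), 1, DataType::F32);
        weights.set_data_layout(DataLayout::NHWC);
        const WinogradInfo winograd_info(Size2D(4U, 4U),            // output tile
                                         Size2D(3U, 3U),            // kernel size
                                         Size2D(56U, 56U),          // input spatial size
                                         PadStrideInfo(1, 1, 1, 1), // unit strides only
                                         DataLayout::NHWC);
        const TensorShape out_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(weights, winograd_info);
        const TensorInfo  output(out_shape, 1, DataType::F32);
        return CLWinogradFilterTransformKernel::validate(&weights, &output, winograd_info).error_code() == ErrorCode::OK;
    }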
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index 15c239e..6f695c9 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
+#include "src/core/CL/kernels/CLWinogradInputTransformKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.h b/src/core/CL/kernels/CLWinogradInputTransformKernel.h
new file mode 100644
index 0000000..2530187
--- /dev/null
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H
+#define ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform Winograd input transform.*/
+class CLWinogradInputTransformKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLWinogradInputTransformKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradInputTransformKernel(const CLWinogradInputTransformKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradInputTransformKernel &operator=(const CLWinogradInputTransformKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLWinogradInputTransformKernel(CLWinogradInputTransformKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLWinogradInputTransformKernel &operator=(CLWinogradInputTransformKernel &&) = default;
+    /** Set the input and output of the kernel.
+     *
+     * @note Winograd input transform supports the following configurations for NCHW data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd input transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in] input         The input tensor to transform. Data types supported: F16/F32
+     * @param[out] output       The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
+     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
+    /** Set the input and output of the kernel.
+     *
+     * @note Winograd input transform supports the following configurations for NCHW data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd input transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] input           The input tensor to transform. Data types supported: F16/F32
+     * @param[out] output         The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
+     * @param[in] winograd_info   Contains Winograd's information described in @ref WinogradInfo.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradInputTransformKernel
+     *
+     * @note Winograd input transform supports the following configurations for NCHW data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd input transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in] input         The input tensor to transform. Data types supported: F16/F32
+     * @param[in] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
+     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+    BorderSize       _border_size;
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    DataLayout       _data_layout;
+    int              _num_tiles_x;
+    int              _num_tiles_y;
+    unsigned int     _step_z;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H */
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index 89a5176..2018559 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
+#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.h b/src/core/CL/kernels/CLWinogradOutputTransformKernel.h
new file mode 100644
index 0000000..632a562
--- /dev/null
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H
+#define ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the Winograd output transform kernel. */
+class CLWinogradOutputTransformKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLWinogradOutputTransformKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradOutputTransformKernel(const CLWinogradOutputTransformKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradOutputTransformKernel &operator=(const CLWinogradOutputTransformKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLWinogradOutputTransformKernel(CLWinogradOutputTransformKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLWinogradOutputTransformKernel &operator=(CLWinogradOutputTransformKernel &&) = default;
+    /** Default destructor */
+    ~CLWinogradOutputTransformKernel() = default;
+    /** Set the input and output tensor.
+     *
+     * @note Winograd output transform supports the following configurations for NCHW data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd output transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in]  input         Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
+     * @param[in]  bias          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
+     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
+     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
+     * @param[in]  act_info      (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Set the input and output tensor.
+     *
+     * @note Winograd output transform supports the following configurations for NCHW data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd output transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
+     * @param[in]  bias            Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
+     * @param[out] output          The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
+     * @param[in]  winograd_info   Contains Winograd's information described in @ref WinogradInfo
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradOutputTransformKernel
+     *
+     * @note Winograd output transform supports the following configurations for NCHW data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd output transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in]  input         Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
+     * @param[in]  bias          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
+     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
+     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
+     * @param[in]  act_info      (Optional) Activation layer information in case of a fused activation @ref ActivationLayerInfo. Only RELU, BOUNDED_RELU, LU_BOUNDED_RELU, LEAKY_RELU and SOFT_RELU supported.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+    const ICLTensor *_input;
+    const ICLTensor *_bias;
+    ICLTensor       *_output;
+    bool             _is_nhwc;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H */
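The fused-activation parameter above accepts only the activation functions listed in the validate() documentation; a hedged one-liner showing how such an ActivationLayerInfo would typically be constructed (the clamp value is illustrative):

    #include "arm_compute/core/Types.h"

    // Bounded ReLU clamped to [0, 6], one of the activations listed as fusable above.
    const arm_compute::ActivationLayerInfo act_info(
        arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);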
diff --git a/src/core/CL/kernels/CLYOLOLayerKernel.cpp b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
index 0c7588d..e12d1e7 100644
--- a/src/core/CL/kernels/CLYOLOLayerKernel.cpp
+++ b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
+#include "src/core/CL/kernels/CLYOLOLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/src/core/CL/kernels/CLYOLOLayerKernel.h b/src/core/CL/kernels/CLYOLOLayerKernel.h
new file mode 100644
index 0000000..5b1d56e
--- /dev/null
+++ b/src/core/CL/kernels/CLYOLOLayerKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLYOLOLAYERKERNEL_H
+#define ARM_COMPUTE_CLYOLOLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the YOLO layer kernel that performs partial activation.
+ *  For each box, activate only:
+ *    - x and y position (channel 0 and 1 of each box)
+ *    - objectness       (channel 4 of each box)
+ *    - classes          (channels 5 to (5 + num_classes - 1) of each box)
+ */
+class CLYOLOLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLYOLOLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLYOLOLayerKernel(const CLYOLOLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLYOLOLayerKernel &operator=(const CLYOLOLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLYOLOLayerKernel(CLYOLOLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLYOLOLayerKernel &operator=(CLYOLOLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLYOLOLayerKernel() = default;
+    /** Set the input and output tensor.
+     *
+     * @note If the output tensor is a nullptr, the activation function will be performed in-place
+     *
+     * @param[in, out] input       Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
+     *                             of the activation function. Data types supported: F16/F32.
+     * @param[out]     output      Destination tensor. Data type supported: same as @p input
+     * @param[in]      act_info    Activation layer information.
+     * @param[in]      num_classes Number of classes to activate (must be a submultiple of @p input channels)
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
+    /** Set the input and output tensor.
+     *
+     * @note If the output tensor is a nullptr, the activation function will be performed in-place
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
+     *                                 of the activation function. Data types supported: F16/F32.
+     * @param[out]     output          Destination tensor. Data type supported: same as @p input
+     * @param[in]      act_info        Activation layer information.
+     * @param[in]      num_classes     Number of classes to activate (must be a submultiple of @p input channels)
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLYOLOLayerKernel
+     *
+     * @param[in] input       Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
+     *                        of the activation function. Data types supported: F16/F32.
+     * @param[in] output      Destination tensor info. Data type supported: same as @p input
+     * @param[in] act_info    Activation layer information.
+     * @param[in] num_classes Number of classes to activate (must be a submultiple of @p input channels)
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor *_input;
+    ICLTensor *_output;
+    bool       _run_in_place;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLYOLOLAYERKERNEL_H */
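A hedged host-side sketch of the channel selection described above, assuming each box occupies (5 + num_classes) consecutive channels (x, y, w, h, objectness, then the class scores); only the flagged channels are passed through the activation:

    #include <vector>

    // Return, for one box of (5 + num_classes) channels, which channels the YOLO
    // layer activates: x (0), y (1), objectness (4) and every class score (5..).
    // Width and height (channels 2 and 3) are left untouched.
    std::vector<bool> yolo_activation_mask(int num_classes)
    {
        std::vector<bool> activate(5 + num_classes, false);
        activate[0] = true; // x
        activate[1] = true; // y
        activate[4] = true; // objectness
        for(int c = 5; c < 5 + num_classes; ++c)
        {
            activate[c] = true; // class scores
        }
        return activate;
    }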
diff --git a/src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h b/src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
new file mode 100644
index 0000000..4c92ae4
--- /dev/null
+++ b/src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H
+#define ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor.
+ */
+class ICLDepthwiseConvolutionLayer3x3Kernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    ICLDepthwiseConvolutionLayer3x3Kernel()
+        : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1), _output_multipliers(), _output_shifts(), _is_quantized(false)
+    {
+    }
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ICLDepthwiseConvolutionLayer3x3Kernel(const ICLDepthwiseConvolutionLayer3x3Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ICLDepthwiseConvolutionLayer3x3Kernel &operator=(const ICLDepthwiseConvolutionLayer3x3Kernel &) = delete;
+    /** Default Move Constructor. */
+    ICLDepthwiseConvolutionLayer3x3Kernel(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default;
+    /** Default move assignment operator */
+    ICLDepthwiseConvolutionLayer3x3Kernel &operator=(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default;
+    /** Initialize the function's source, destination, weights, biases and convolution information.
+     *
+     * @param[in]  input              Source tensor. DataType supported: QASYMM8/F16/F32.
+     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
+     *                                Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8.
+     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                                Data type supported: Same as @p input, S32 when input is QASYMM8.
+     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  conv_info          Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
+     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     */
+    virtual void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
+                           const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0;
+    /** Initialize the function's source, destination, weights, biases and convolution information.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  input              Source tensor. DataType supported: QASYMM8/F16/F32.
+     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
+     *                                Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8.
+     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                                Data type supported: Same as @p input, S32 when input is QASYMM8.
+     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  conv_info          Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
+     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     */
+    virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
+                           const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0;
+
+protected:
+    BorderSize       _border_size;
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    const ICLTensor *_weights;
+    const ICLTensor *_biases;
+    unsigned int     _conv_stride_y;
+    const ICLTensor *_output_multipliers;
+    const ICLTensor *_output_shifts;
+    bool             _is_quantized;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H */
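For the quantized path described above, the optional multiplier and shift tensors are 1D S32 tensors whose length matches the number of filters. A hedged sketch of the corresponding tensor infos (the IFM count is illustrative):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"

    using namespace arm_compute;

    // Per-channel requantization parameters for a depthwise 3x3 kernel with 32 filters:
    // one S32 multiplier and one S32 shift per input feature map (IFM).
    constexpr unsigned int ifm = 32;
    const TensorInfo output_multipliers(TensorShape(ifm), 1, DataType::S32);
    const TensorInfo output_shifts(TensorShape(ifm), 1, DataType::S32);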