Remove Compute Vision CL support

Resolves COMPMID-4151

Change-Id: I46f541efe8c4087f27794d2e158b6c1547d459ba
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5160
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>

commit: 473cb01e84cef6cab057e9492bfa3b68f708e5d7 [log] [tgz]
author: Michalis Spyrou <michalis.spyrou@arm.com> Tue Feb 23 11:48:12 2021 +0000
committer: Michalis Spyrou <michalis.spyrou@arm.com> Wed Mar 03 15:04:20 2021 +0000
tree: a500b8a8afe6a0442e1a54fb8d52c77d22543bcb
parent: f466d75f85938b96dd14675ec091193bdce12122 [diff]
diff --git a/Android.bp b/Android.bp
index c5980c3..a51b91b 100644
--- a/Android.bp
+++ b/Android.bp

@@ -82,23 +82,15 @@
         "src/core/CL/gemm/reshaped/CLGEMMDefaultConfigReshapedValhall.cpp",
         "src/core/CL/gemm/reshaped_only_rhs/CLGEMMDefaultConfigReshapedRHSOnlyBifrost.cpp",
         "src/core/CL/gemm/reshaped_only_rhs/CLGEMMDefaultConfigReshapedRHSOnlyValhall.cpp",
-        "src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp",
-        "src/core/CL/kernels/CLAccumulateKernel.cpp",
         "src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp",
         "src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp",
         "src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp",
         "src/core/CL/kernels/CLBitwiseKernel.cpp",
         "src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp",
-        "src/core/CL/kernels/CLBox3x3Kernel.cpp",
-        "src/core/CL/kernels/CLCannyEdgeKernel.cpp",
-        "src/core/CL/kernels/CLChannelCombineKernel.cpp",
-        "src/core/CL/kernels/CLChannelExtractKernel.cpp",
         "src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp",
         "src/core/CL/kernels/CLCol2ImKernel.cpp",
-        "src/core/CL/kernels/CLColorConvertKernel.cpp",
         "src/core/CL/kernels/CLComparisonKernel.cpp",
         "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp",
-        "src/core/CL/kernels/CLConvolutionKernel.cpp",
         "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp",
         "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp",
         "src/core/CL/kernels/CLDepthConvertLayerKernel.cpp",
@@ -108,14 +100,10 @@
         "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp",
         "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp",
         "src/core/CL/kernels/CLDequantizationLayerKernel.cpp",
-        "src/core/CL/kernels/CLDerivativeKernel.cpp",
-        "src/core/CL/kernels/CLDilateKernel.cpp",
         "src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp",
-        "src/core/CL/kernels/CLErodeKernel.cpp",
         "src/core/CL/kernels/CLFFTDigitReverseKernel.cpp",
         "src/core/CL/kernels/CLFFTRadixStageKernel.cpp",
         "src/core/CL/kernels/CLFFTScaleKernel.cpp",
-        "src/core/CL/kernels/CLFastCornersKernel.cpp",
         "src/core/CL/kernels/CLFillBorderKernel.cpp",
         "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp",
@@ -134,28 +122,14 @@
         "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp",
         "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp",
         "src/core/CL/kernels/CLGatherKernel.cpp",
-        "src/core/CL/kernels/CLGaussian3x3Kernel.cpp",
-        "src/core/CL/kernels/CLGaussian5x5Kernel.cpp",
-        "src/core/CL/kernels/CLGaussianPyramidKernel.cpp",
         "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp",
-        "src/core/CL/kernels/CLHOGDescriptorKernel.cpp",
-        "src/core/CL/kernels/CLHOGDetectorKernel.cpp",
-        "src/core/CL/kernels/CLHarrisCornersKernel.cpp",
-        "src/core/CL/kernels/CLHistogramKernel.cpp",
         "src/core/CL/kernels/CLIm2ColKernel.cpp",
         "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp",
-        "src/core/CL/kernels/CLIntegralImageKernel.cpp",
         "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp",
         "src/core/CL/kernels/CLLKTrackerKernel.cpp",
-        "src/core/CL/kernels/CLMagnitudePhaseKernel.cpp",
         "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp",
-        "src/core/CL/kernels/CLMeanStdDevKernel.cpp",
         "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp",
-        "src/core/CL/kernels/CLMedian3x3Kernel.cpp",
         "src/core/CL/kernels/CLMinMaxLayerKernel.cpp",
-        "src/core/CL/kernels/CLMinMaxLocationKernel.cpp",
-        "src/core/CL/kernels/CLNonLinearFilterKernel.cpp",
-        "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp",
         "src/core/CL/kernels/CLNormalizationLayerKernel.cpp",
         "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp",
         "src/core/CL/kernels/CLPadLayerKernel.cpp",
@@ -171,22 +145,14 @@
         "src/core/CL/kernels/CLReorgLayerKernel.cpp",
         "src/core/CL/kernels/CLReverseKernel.cpp",
         "src/core/CL/kernels/CLScaleKernel.cpp",
-        "src/core/CL/kernels/CLScharr3x3Kernel.cpp",
         "src/core/CL/kernels/CLSelectKernel.cpp",
-        "src/core/CL/kernels/CLSobel3x3Kernel.cpp",
-        "src/core/CL/kernels/CLSobel5x5Kernel.cpp",
-        "src/core/CL/kernels/CLSobel7x7Kernel.cpp",
         "src/core/CL/kernels/CLSoftmaxLayerKernel.cpp",
         "src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp",
         "src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp",
         "src/core/CL/kernels/CLStackLayerKernel.cpp",
         "src/core/CL/kernels/CLStridedSliceKernel.cpp",
-        "src/core/CL/kernels/CLTableLookupKernel.cpp",
-        "src/core/CL/kernels/CLThresholdKernel.cpp",
         "src/core/CL/kernels/CLTileKernel.cpp",
         "src/core/CL/kernels/CLTransposeKernel.cpp",
-        "src/core/CL/kernels/CLWarpAffineKernel.cpp",
-        "src/core/CL/kernels/CLWarpPerspectiveKernel.cpp",
         "src/core/CL/kernels/CLWeightsReshapeKernel.cpp",
         "src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp",
         "src/core/CL/kernels/CLWinogradInputTransformKernel.cpp",
@@ -223,7 +189,6 @@
         "src/core/NEON/kernels/NECol2ImKernel.cpp",
         "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp",
         "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp",
-        "src/core/NEON/kernels/NEConvolutionKernel.cpp",
         "src/core/NEON/kernels/NECropKernel.cpp",
         "src/core/NEON/kernels/NECumulativeDistributionKernel.cpp",
         "src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp",
@@ -260,7 +225,6 @@
         "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp",
         "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp",
         "src/core/NEON/kernels/NEMinMaxLayerKernel.cpp",
-        "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp",
         "src/core/NEON/kernels/NENormalizationLayerKernel.cpp",
         "src/core/NEON/kernels/NEPadLayerKernel.cpp",
         "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp",
@@ -271,6 +235,7 @@
         "src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp",
         "src/core/NEON/kernels/NERangeKernel.cpp",
         "src/core/NEON/kernels/NEReductionOperationKernel.cpp",
+        "src/core/NEON/kernels/NERemapKernel.cpp",
         "src/core/NEON/kernels/NEReorgLayerKernel.cpp",
         "src/core/NEON/kernels/NEReverseKernel.cpp",
         "src/core/NEON/kernels/NEScaleKernel.cpp",
@@ -463,8 +428,6 @@
         "src/runtime/CL/CLTuner.cpp",
         "src/runtime/CL/ICLSimpleFunction.cpp",
         "src/runtime/CL/Utils.cpp",
-        "src/runtime/CL/functions/CLAbsoluteDifference.cpp",
-        "src/runtime/CL/functions/CLAccumulate.cpp",
         "src/runtime/CL/functions/CLActivationLayer.cpp",
         "src/runtime/CL/functions/CLArgMinMaxLayer.cpp",
         "src/runtime/CL/functions/CLBatchNormalizationLayer.cpp",
@@ -474,17 +437,11 @@
         "src/runtime/CL/functions/CLBitwiseOr.cpp",
         "src/runtime/CL/functions/CLBitwiseXor.cpp",
         "src/runtime/CL/functions/CLBoundingBoxTransform.cpp",
-        "src/runtime/CL/functions/CLBox3x3.cpp",
-        "src/runtime/CL/functions/CLCannyEdge.cpp",
         "src/runtime/CL/functions/CLCast.cpp",
-        "src/runtime/CL/functions/CLChannelCombine.cpp",
-        "src/runtime/CL/functions/CLChannelExtract.cpp",
         "src/runtime/CL/functions/CLChannelShuffleLayer.cpp",
-        "src/runtime/CL/functions/CLColorConvert.cpp",
         "src/runtime/CL/functions/CLComparison.cpp",
         "src/runtime/CL/functions/CLConcatenateLayer.cpp",
         "src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp",
-        "src/runtime/CL/functions/CLConvolution.cpp",
         "src/runtime/CL/functions/CLConvolutionLayer.cpp",
         "src/runtime/CL/functions/CLCopy.cpp",
         "src/runtime/CL/functions/CLCrop.cpp",
@@ -495,18 +452,13 @@
         "src/runtime/CL/functions/CLDepthToSpaceLayer.cpp",
         "src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp",
         "src/runtime/CL/functions/CLDequantizationLayer.cpp",
-        "src/runtime/CL/functions/CLDerivative.cpp",
-        "src/runtime/CL/functions/CLDilate.cpp",
         "src/runtime/CL/functions/CLDirectConvolutionLayer.cpp",
         "src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp",
         "src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp",
         "src/runtime/CL/functions/CLElementwiseOperations.cpp",
-        "src/runtime/CL/functions/CLEqualizeHistogram.cpp",
-        "src/runtime/CL/functions/CLErode.cpp",
         "src/runtime/CL/functions/CLFFT1D.cpp",
         "src/runtime/CL/functions/CLFFT2D.cpp",
         "src/runtime/CL/functions/CLFFTConvolutionLayer.cpp",
-        "src/runtime/CL/functions/CLFastCorners.cpp",
         "src/runtime/CL/functions/CLFill.cpp",
         "src/runtime/CL/functions/CLFillBorder.cpp",
         "src/runtime/CL/functions/CLFlattenLayer.cpp",
@@ -519,41 +471,21 @@
         "src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp",
         "src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp",
         "src/runtime/CL/functions/CLGather.cpp",
-        "src/runtime/CL/functions/CLGaussian3x3.cpp",
-        "src/runtime/CL/functions/CLGaussian5x5.cpp",
-        "src/runtime/CL/functions/CLGaussianPyramid.cpp",
         "src/runtime/CL/functions/CLGenerateProposalsLayer.cpp",
-        "src/runtime/CL/functions/CLHOGDescriptor.cpp",
-        "src/runtime/CL/functions/CLHOGDetector.cpp",
-        "src/runtime/CL/functions/CLHOGGradient.cpp",
-        "src/runtime/CL/functions/CLHOGMultiDetection.cpp",
-        "src/runtime/CL/functions/CLHarrisCorners.cpp",
-        "src/runtime/CL/functions/CLHistogram.cpp",
         "src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp",
-        "src/runtime/CL/functions/CLIntegralImage.cpp",
         "src/runtime/CL/functions/CLL2NormalizeLayer.cpp",
         "src/runtime/CL/functions/CLLSTMLayer.cpp",
         "src/runtime/CL/functions/CLLSTMLayerQuantized.cpp",
-        "src/runtime/CL/functions/CLLaplacianPyramid.cpp",
-        "src/runtime/CL/functions/CLLaplacianReconstruct.cpp",
         "src/runtime/CL/functions/CLLogicalAnd.cpp",
         "src/runtime/CL/functions/CLLogicalNot.cpp",
         "src/runtime/CL/functions/CLLogicalOr.cpp",
-        "src/runtime/CL/functions/CLMagnitude.cpp",
         "src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp",
-        "src/runtime/CL/functions/CLMeanStdDev.cpp",
         "src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp",
-        "src/runtime/CL/functions/CLMedian3x3.cpp",
-        "src/runtime/CL/functions/CLMinMaxLocation.cpp",
-        "src/runtime/CL/functions/CLNonLinearFilter.cpp",
-        "src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp",
         "src/runtime/CL/functions/CLNormalizationLayer.cpp",
         "src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp",
-        "src/runtime/CL/functions/CLOpticalFlow.cpp",
         "src/runtime/CL/functions/CLPReluLayer.cpp",
         "src/runtime/CL/functions/CLPadLayer.cpp",
         "src/runtime/CL/functions/CLPermute.cpp",
-        "src/runtime/CL/functions/CLPhase.cpp",
         "src/runtime/CL/functions/CLPixelWiseMultiplication.cpp",
         "src/runtime/CL/functions/CLPoolingLayer.cpp",
         "src/runtime/CL/functions/CLPriorBoxLayer.cpp",
@@ -570,25 +502,17 @@
         "src/runtime/CL/functions/CLReshapeLayer.cpp",
         "src/runtime/CL/functions/CLReverse.cpp",
         "src/runtime/CL/functions/CLScale.cpp",
-        "src/runtime/CL/functions/CLScharr3x3.cpp",
         "src/runtime/CL/functions/CLSelect.cpp",
         "src/runtime/CL/functions/CLSlice.cpp",
-        "src/runtime/CL/functions/CLSobel3x3.cpp",
-        "src/runtime/CL/functions/CLSobel5x5.cpp",
-        "src/runtime/CL/functions/CLSobel7x7.cpp",
         "src/runtime/CL/functions/CLSoftmaxLayer.cpp",
         "src/runtime/CL/functions/CLSpaceToBatchLayer.cpp",
         "src/runtime/CL/functions/CLSpaceToDepthLayer.cpp",
         "src/runtime/CL/functions/CLSplit.cpp",
         "src/runtime/CL/functions/CLStackLayer.cpp",
         "src/runtime/CL/functions/CLStridedSlice.cpp",
-        "src/runtime/CL/functions/CLTableLookup.cpp",
-        "src/runtime/CL/functions/CLThreshold.cpp",
         "src/runtime/CL/functions/CLTile.cpp",
         "src/runtime/CL/functions/CLTranspose.cpp",
         "src/runtime/CL/functions/CLUnstack.cpp",
-        "src/runtime/CL/functions/CLWarpAffine.cpp",
-        "src/runtime/CL/functions/CLWarpPerspective.cpp",
         "src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp",
         "src/runtime/CL/functions/CLWinogradInputTransform.cpp",
         "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp",
@@ -646,7 +570,6 @@
         "src/runtime/NEON/functions/NEChannelShuffleLayer.cpp",
         "src/runtime/NEON/functions/NEConcatenateLayer.cpp",
         "src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp",
-        "src/runtime/NEON/functions/NEConvolution.cpp",
         "src/runtime/NEON/functions/NEConvolutionLayer.cpp",
         "src/runtime/NEON/functions/NECopy.cpp",
         "src/runtime/NEON/functions/NECropResize.cpp",
@@ -683,7 +606,6 @@
         "src/runtime/NEON/functions/NELogical.cpp",
         "src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp",
         "src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp",
-        "src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp",
         "src/runtime/NEON/functions/NENormalizationLayer.cpp",
         "src/runtime/NEON/functions/NEPReluLayer.cpp",
         "src/runtime/NEON/functions/NEPadLayer.cpp",
@@ -699,6 +621,7 @@
         "src/runtime/NEON/functions/NERange.cpp",
         "src/runtime/NEON/functions/NEReduceMean.cpp",
         "src/runtime/NEON/functions/NEReductionOperation.cpp",
+        "src/runtime/NEON/functions/NERemap.cpp",
         "src/runtime/NEON/functions/NEReorgLayer.cpp",
         "src/runtime/NEON/functions/NEReshapeLayer.cpp",
         "src/runtime/NEON/functions/NEReverse.cpp",

diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index b2bdb9a..01b61c8 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h

@@ -25,8 +25,6 @@
 #define ARM_COMPUTE_CLFUNCTIONS_H
 
 /* Header regrouping all the CL functions */
-#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
-#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
 #include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
@@ -36,17 +34,11 @@
 #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
 #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
 #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"
-#include "arm_compute/runtime/CL/functions/CLBox3x3.h"
-#include "arm_compute/runtime/CL/functions/CLCannyEdge.h"
 #include "arm_compute/runtime/CL/functions/CLCast.h"
-#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
-#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
 #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
-#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
 #include "arm_compute/runtime/CL/functions/CLComparison.h"
 #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
 #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
-#include "arm_compute/runtime/CL/functions/CLConvolution.h"
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLCopy.h"
 #include "arm_compute/runtime/CL/functions/CLCrop.h"
@@ -57,18 +49,13 @@
 #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h"
 #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLDerivative.h"
-#include "arm_compute/runtime/CL/functions/CLDilate.h"
 #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
-#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
-#include "arm_compute/runtime/CL/functions/CLErode.h"
 #include "arm_compute/runtime/CL/functions/CLFFT1D.h"
 #include "arm_compute/runtime/CL/functions/CLFFT2D.h"
 #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLFastCorners.h"
 #include "arm_compute/runtime/CL/functions/CLFill.h"
 #include "arm_compute/runtime/CL/functions/CLFillBorder.h"
 #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
@@ -81,41 +68,21 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/CL/functions/CLGather.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
 #include "arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h"
-#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
-#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
-#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
-#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
-#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
-#include "arm_compute/runtime/CL/functions/CLHistogram.h"
 #include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
 #include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h"
 #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
 #include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
-#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
 #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"
 #include "arm_compute/runtime/CL/functions/CLLogicalNot.h"
 #include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
-#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
 #include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h"
-#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
 #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
-#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h"
-#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
-#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
 #include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"
-#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h"
 #include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
 #include "arm_compute/runtime/CL/functions/CLPadLayer.h"
 #include "arm_compute/runtime/CL/functions/CLPermute.h"
-#include "arm_compute/runtime/CL/functions/CLPhase.h"
 #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
 #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
 #include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h"
@@ -132,25 +99,17 @@
 #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/CL/functions/CLReverse.h"
 #include "arm_compute/runtime/CL/functions/CLScale.h"
-#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
 #include "arm_compute/runtime/CL/functions/CLSelect.h"
 #include "arm_compute/runtime/CL/functions/CLSlice.h"
-#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
-#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
-#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
 #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
 #include "arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h"
 #include "arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h"
 #include "arm_compute/runtime/CL/functions/CLSplit.h"
 #include "arm_compute/runtime/CL/functions/CLStackLayer.h"
 #include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
-#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
-#include "arm_compute/runtime/CL/functions/CLThreshold.h"
 #include "arm_compute/runtime/CL/functions/CLTile.h"
 #include "arm_compute/runtime/CL/functions/CLTranspose.h"
 #include "arm_compute/runtime/CL/functions/CLUnstack.h"
-#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
-#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
 #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
 

diff --git a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h
deleted file mode 100644
index 86c8022..0000000
--- a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h
+++ /dev/null

@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H
-#define ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to run @ref CLAbsoluteDifferenceKernel
- *
- * @note The tensor data types for the inputs must be U8 or S16.
- * @note The function calculates the absolute difference also when the 2 inputs have different tensor data types.
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLAbsoluteDifference : public ICLSimpleFunction
-{
-public:
-    /** Initialize the function
-     *
-     * @param[in]  input1 First input tensor. Data types supported: U8, S16
-     * @param[in]  input2 Second input tensor. Data types supported: U8, S16
-     * @param[out] output Output tensor. Data types supported: U8, S16
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-    /** Initialize the function
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          First input tensor. Data types supported: U8, S16
-     * @param[in]  input2          Second input tensor. Data types supported: U8, S16
-     * @param[out] output          Output tensor. Data types supported: U8, S16
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-};
-}
-#endif /* ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H */

diff --git a/arm_compute/runtime/CL/functions/CLAccumulate.h b/arm_compute/runtime/CL/functions/CLAccumulate.h
deleted file mode 100644
index f78ce0e..0000000
--- a/arm_compute/runtime/CL/functions/CLAccumulate.h
+++ /dev/null

@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLACCUMULATE_H
-#define ARM_COMPUTE_CLACCUMULATE_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to run @ref CLAccumulateKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLAccumulate : public ICLSimpleFunction
-{
-public:
-    /** Set the input and accumulation tensors.
-     *
-     * @param[in]  input Source tensor. Data types supported: U8.
-     * @param[out] accum Destination tensor. Data types supported: S16.
-     */
-    void configure(const ICLTensor *input, ICLTensor *accum);
-    /** Set the input and accumulation tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] accum           Destination tensor. Data types supported: S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum);
-};
-
-/** Basic function to run @ref CLAccumulateWeightedKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLAccumulateWeighted : public ICLSimpleFunction
-{
-public:
-    /** Set the input and accumulation tensors, and the scale value.
-     *
-     * @param[in]     input Source tensor. Data types supported: U8.
-     * @param[in]     alpha The input scalar value with a value input the range of [0, 1.0]. Data types supported: F32.
-     * @param[in,out] accum Accumulated tensor. Data types supported: U8.
-     */
-    void configure(const ICLTensor *input, float alpha, ICLTensor *accum);
-    /** Set the input and accumulation tensors, and the scale value.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Source tensor. Data types supported: U8.
-     * @param[in]     alpha           The input scalar value with a value input the range of [0, 1.0]. Data types supported: F32.
-     * @param[in,out] accum           Accumulated tensor. Data types supported: U8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum);
-};
-
-/** Basic function to run @ref CLAccumulateSquaredKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLAccumulateSquared : public ICLSimpleFunction
-{
-public:
-    /** Set the input and accumulation tensors and the shift value.
-     *
-     * @param[in]     input Source tensor. Data types supported: U8.
-     * @param[in]     shift The input with a value input the range of [0, 15]. Data types supported: U32.
-     * @param[in,out] accum Accumulated tensor. Data types supported: S16.
-     */
-    void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum);
-    /** Set the input and accumulation tensors and the shift value.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Source tensor. Data types supported: U8.
-     * @param[in]     shift           The input with a value input the range of [0, 15]. Data types supported: U32.
-     * @param[in,out] accum           Accumulated tensor. Data types supported: S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum);
-};
-}
-#endif /*ARM_COMPUTE_CLACCUMULATE_H */

diff --git a/arm_compute/runtime/CL/functions/CLBox3x3.h b/arm_compute/runtime/CL/functions/CLBox3x3.h
deleted file mode 100644
index 63c5d3f..0000000
--- a/arm_compute/runtime/CL/functions/CLBox3x3.h
+++ /dev/null

@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBOX3X3_H
-#define ARM_COMPUTE_CLBOX3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute box filter 3x3. This function calls the following OpenCL kernels:
- *
- *  -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- *  -# @ref CLBox3x3Kernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLBox3x3 : public ICLSimpleFunction
-{
-public:
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLBOX3X3_H */

diff --git a/arm_compute/runtime/CL/functions/CLCannyEdge.h b/arm_compute/runtime/CL/functions/CLCannyEdge.h
deleted file mode 100644
index 1c48d69..0000000
--- a/arm_compute/runtime/CL/functions/CLCannyEdge.h
+++ /dev/null

@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCANNYEDGE_H
-#define ARM_COMPUTE_CLCANNYEDGE_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLFillBorderKernel;
-class CLGradientKernel;
-class CLEdgeNonMaxSuppressionKernel;
-class CLEdgeTraceKernel;
-class ICLTensor;
-
-/** Basic function to execute canny edge on OpenCL. This function calls the following OpenCL kernels and functions:
- *
- * -# @ref CLFillBorderKernel (if border_mode == REPLICATE or border_mode == CONSTANT)
- * -# @ref CLSobel3x3 (if gradient_size == 3) or @ref CLSobel5x5 (if gradient_size == 5) or @ref CLSobel7x7 (if gradient_size == 7)
- * -# @ref CLGradientKernel
- * -# @ref CLEdgeNonMaxSuppressionKernel
- * -# @ref CLEdgeTraceKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLCannyEdge : public IFunction
-{
-public:
-    /** Constructor */
-    CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCannyEdge(const CLCannyEdge &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCannyEdge &operator=(const CLCannyEdge &) = delete;
-    /** Default destructor */
-    ~CLCannyEdge();
-    /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor. Data types supported: U8.
-     * @param[in]     upper_thr             Upper threshold used for the hysteresis.
-     * @param[in]     lower_thr             Lower threshold used for the hysteresis.
-     * @param[in]     gradient_size         Gradient size (3, 5 or 7).
-     * @param[in]     norm_type             Normalization type. if 1, L1-Norm otherwise L2-Norm.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
-                   uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor. Data types supported: U8.
-     * @param[in]     upper_thr             Upper threshold used for the hysteresis.
-     * @param[in]     lower_thr             Lower threshold used for the hysteresis.
-     * @param[in]     gradient_size         Gradient size (3, 5 or 7).
-     * @param[in]     norm_type             Normalization type. if 1, L1-Norm otherwise L2-Norm.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
-                   uint8_t constant_border_value = 0);
-
-    // Inherited methods overridden:
-    virtual void run() override;
-
-private:
-    MemoryGroup                                    _memory_group;                                    /**< Function's memory group */
-    std::unique_ptr<IFunction>                     _sobel;                                           /**< Pointer to Sobel kernel. */
-    std::unique_ptr<CLGradientKernel>              _gradient;                                        /**< Gradient kernel. */
-    std::unique_ptr<CLFillBorderKernel>            _border_mag_gradient;                             /**< Fill border on magnitude tensor kernel */
-    std::unique_ptr<CLEdgeNonMaxSuppressionKernel> _non_max_suppr;                                   /**< Non-Maxima suppression kernel. */
-    std::unique_ptr<CLEdgeTraceKernel>             _edge_trace;                                      /**< Edge tracing kernel. */
-    CLImage                                        _gx;                                              /**< Source tensor - Gx component. */
-    CLImage                                        _gy;                                              /**< Source tensor - Gy component. */
-    CLImage                                        _mag;                                             /**< Source tensor - Magnitude. */
-    CLImage                                        _phase;                                           /**< Source tensor - Phase. */
-    CLImage                                        _nonmax;                                          /**< Source tensor - Non-Maxima suppressed. */
-    CLImage                                        _visited, _recorded, _l1_list_counter, _l1_stack; /**< Temporary tensors */
-    ICLTensor                                     *_output;                                          /**< Output tensor provided by the user. */
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLCANNYEDGE_H */

diff --git a/arm_compute/runtime/CL/functions/CLChannelCombine.h b/arm_compute/runtime/CL/functions/CLChannelCombine.h
deleted file mode 100644
index 2a36d3f..0000000
--- a/arm_compute/runtime/CL/functions/CLChannelCombine.h
+++ /dev/null

@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELCOMBINE_H
-#define ARM_COMPUTE_CLCHANNELCOMBINE_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to run @ref CLChannelCombineKernel to perform channel combination.
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLChannelCombine : public ICLSimpleFunction
-{
-public:
-    /** Initialize function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[in]  plane3 The 2D plane that forms channel 3. Must be of U8 format.
-     * @param[out] output The single planar output tensor.
-     */
-    void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
-    /** Initialize function's inputs and outputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  plane0          The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1          The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2          The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[in]  plane3          The 2D plane that forms channel 3. Must be of U8 format.
-     * @param[out] output          The single planar output tensor.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
-    /** Initialize function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[out] output The multi planar output image.
-     */
-    void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
-    /** Initialize function's inputs and outputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  plane0          The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1          The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2          The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[out] output          The multi planar output image.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
-};
-}
-#endif /*ARM_COMPUTE_CLCHANNELCOMBINE_H*/

diff --git a/arm_compute/runtime/CL/functions/CLChannelExtract.h b/arm_compute/runtime/CL/functions/CLChannelExtract.h
deleted file mode 100644
index 6cd2464..0000000
--- a/arm_compute/runtime/CL/functions/CLChannelExtract.h
+++ /dev/null

@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELEXTRACT_H
-#define ARM_COMPUTE_CLCHANNELEXTRACT_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to run @ref CLChannelExtractKernel to perform channel extraction.
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLChannelExtract : public ICLSimpleFunction
-{
-public:
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  input   The input tensor to extract the channel from. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
-     * @param[in]  channel The channel to extract.
-     * @param[out] output  The extracted channel. Must be of U8 format.
-     */
-    void configure(const ICLTensor *input, Channel channel, ICLTensor *output);
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to extract the channel from. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
-     * @param[in]  channel         The channel to extract.
-     * @param[out] output          The extracted channel. Must be of U8 format.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output);
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  input   The multi-planar input image to extract channel from. Formats supported: NV12/NV21/IYUV/YUV444
-     * @param[in]  channel The channel to extract.
-     * @param[out] output  The extracted 2D channel. Must be of U8 format.
-     */
-    void configure(const ICLMultiImage *input, Channel channel, ICLImage *output);
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The multi-planar input image to extract channel from. Formats supported: NV12/NV21/IYUV/YUV444
-     * @param[in]  channel         The channel to extract.
-     * @param[out] output          The extracted 2D channel. Must be of U8 format.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output);
-};
-}
-#endif /*ARM_COMPUTE_CLCHANNELEXTRACT_H*/

diff --git a/arm_compute/runtime/CL/functions/CLColorConvert.h b/arm_compute/runtime/CL/functions/CLColorConvert.h
deleted file mode 100644
index f30621e..0000000
--- a/arm_compute/runtime/CL/functions/CLColorConvert.h
+++ /dev/null

@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOLORCONVERT_H
-#define ARM_COMPUTE_CLCOLORCONVERT_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to run @ref CLColorConvertKernel
- *
- * @note The function performs color convert between images.
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLColorConvert : public ICLSimpleFunction
-{
-public:
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  input  Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
-     * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
-     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
-     *                                                          U8 (if the formats of @p input is RGB888)
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
-     * @param[out] output          Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
-     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
-     *                                                          U8 (if the formats of @p input is RGB888)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
-     */
-    void configure(const ICLMultiImage *input, ICLImage *output);
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output          Single-planar destination image. Formats supported: RGB888/RGBA8888
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output);
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  input  Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
-     */
-    void configure(const ICLImage *input, ICLMultiImage *output);
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     * @param[out] output          Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output);
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of  @p input is IYUV)
-     */
-    void configure(const ICLMultiImage *input, ICLMultiImage *output);
-    /** Initialize the function's source, destination
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output          Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of  @p input is IYUV)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output);
-};
-}
-#endif /* ARM_COMPUTE_CLCOLORCONVERT_H */

diff --git a/arm_compute/runtime/CL/functions/CLConvolution.h b/arm_compute/runtime/CL/functions/CLConvolution.h
deleted file mode 100644
index 4434676..0000000
--- a/arm_compute/runtime/CL/functions/CLConvolution.h
+++ /dev/null

@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONVOLUTION_H
-#define ARM_COMPUTE_CLCONVOLUTION_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-template <unsigned int matrix_size>
-class CLConvolutionKernel;
-template <unsigned int matrix_size>
-class CLSeparableConvolutionHorKernel;
-template <unsigned int matrix_size>
-class CLSeparableConvolutionVertKernel;
-class CLFillBorderKernel;
-class ICLTensor;
-
-/** Basic function to execute convolution of size 3x3. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLConvolution3x3Kernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLConvolution3x3 : public ICLSimpleFunction
-{
-public:
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8 or S16.
-     * @param[in]     conv                  matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
-     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8 or S16.
-     * @param[in]     conv                  matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
-     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-
-/** Basic function to execute square convolution.Currently it supports 5x5, 7x7, 9x9. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLConvolutionKernel or<br/>
- *    @ref CLSeparableConvolutionHorKernel and @ref CLSeparableConvolutionVertKernel (if convolution matrix is separable)
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-template <unsigned int matrix_size>
-class CLConvolutionSquare : public IFunction
-{
-public:
-    /** Default constructor */
-    CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvolutionSquare(const CLConvolutionSquare &) = delete;
-    /** Default move constructor */
-    CLConvolutionSquare(CLConvolutionSquare &&) = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvolutionSquare &operator=(const CLConvolutionSquare &) = delete;
-    /** Default move assignment operator */
-    CLConvolutionSquare &operator=(CLConvolutionSquare &&) = default;
-    /** Default destructor */
-    ~CLConvolutionSquare();
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8 or S16.
-     * @param[in]     conv                  matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
-     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8 or S16.
-     * @param[in]     conv                  matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
-     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-
-    // Inherited methods overriden:
-    void run() override;
-
-private:
-    MemoryGroup                                                    _memory_group;   /**< Function's memory group */
-    CLTensor                                                       _tmp;            /**< temporary buffer for output of horizontal pass */
-    bool                                                           _is_separable;   /**< true if the convolution can be separated */
-    std::unique_ptr<CLSeparableConvolutionHorKernel<matrix_size>>  _kernel_hor;     /**< kernel for horizontal pass of separated convolution */
-    std::unique_ptr<CLSeparableConvolutionVertKernel<matrix_size>> _kernel_vert;    /**< kernel for vertical pass of separated convolution */
-    std::unique_ptr<CLConvolutionKernel<matrix_size>>              _kernel;         /**< kernel for non-separated convolution **/
-    std::unique_ptr<CLFillBorderKernel>                            _border_handler; /**< kernel for border handling */
-};
-
-/** Basic function to run 5x5 convolution. */
-using CLConvolution5x5 = CLConvolutionSquare<5>;
-/** Basic function to run 7x7 convolution. */
-using CLConvolution7x7 = CLConvolutionSquare<7>;
-/** Basic function to run 9x9 convolution. */
-using CLConvolution9x9 = CLConvolutionSquare<9>;
-
-/** Basic function to execute non-square convolution. This function calls the following CL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLConvolutionRectangleKernel or<br/>
- *
- * @note Convolution rectangle should have dimensions of 3, 5, 7, 9
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLConvolutionRectangle : public ICLSimpleFunction
-{
-public:
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8 or S16.
-     * @param[in]     conv                  Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
-     * @param[in]     rows                  Rows of convolution kernel.
-     * @param[in]     cols                  Columns of convolution kernel.
-     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8 or S16.
-     * @param[in]     conv                  Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
-     * @param[in]     rows                  Rows of convolution kernel.
-     * @param[in]     cols                  Columns of convolution kernel.
-     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode,
-                   uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLCONVOLUTION_H */

diff --git a/arm_compute/runtime/CL/functions/CLDerivative.h b/arm_compute/runtime/CL/functions/CLDerivative.h
deleted file mode 100644
index 8918dac..0000000
--- a/arm_compute/runtime/CL/functions/CLDerivative.h
+++ /dev/null

@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDERIVATIVE_H
-#define ARM_COMPUTE_CLDERIVATIVE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute first order derivative operator. This function calls the following CL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLDerivativeKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLDerivative : public ICLSimpleFunction
-{
-public:
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note At least one of output_x or output_y must be not NULL.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output_x              (optional) Destination tensor. Derivative along the X direction. Data types supported: S16.
-     * @param[out]    output_y              (optional) Destination tensor. Derivative along the Y direction. Data types supported: S16.
-     * @param[in]     border_mode           Border mode to use
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     *
-     */
-    void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note At least one of output_x or output_y must be not NULL.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output_x              (optional) Destination tensor. Derivative along the X direction. Data types supported: S16.
-     * @param[out]    output_y              (optional) Destination tensor. Derivative along the Y direction. Data types supported: S16.
-     * @param[in]     border_mode           Border mode to use
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     *
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /* ARM_COMPUTE_CLDERIVATIVE_H */

diff --git a/arm_compute/runtime/CL/functions/CLDilate.h b/arm_compute/runtime/CL/functions/CLDilate.h
deleted file mode 100644
index e15621b..0000000
--- a/arm_compute/runtime/CL/functions/CLDilate.h
+++ /dev/null

@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDILATE_H
-#define ARM_COMPUTE_CLDILATE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute dilate. This function calls the following OpenCL kernels:
-*
-* -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
-* -# @ref CLDilateKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLDilate : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs, output and border mode.
-     *
-     * @param[in,out] input                 First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Output tensor. Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the kernel's inputs, output and border mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Output tensor. Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLDILATE_H */

diff --git a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
deleted file mode 100644
index 41479e3..0000000
--- a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
+++ /dev/null

@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H
-#define ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H
-
-#include "arm_compute/runtime/CL/CLDistribution1D.h"
-#include "arm_compute/runtime/CL/CLLut.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLHistogramKernel;
-class CLHistogramBorderKernel;
-class CLTableLookupKernel;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to execute histogram equalization. This function calls the following CL kernels:
- *
- * -# @ref CLHistogramKernel
- * -# @ref CLTableLookupKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLEqualizeHistogram : public IFunction
-{
-public:
-    /** Default Constructor. */
-    CLEqualizeHistogram();
-    /** Prevent instances of this class from being copied */
-    CLEqualizeHistogram(const CLEqualizeHistogram &) = delete;
-    /** Prevent instances of this class from being copied */
-    CLEqualizeHistogram &operator=(const CLEqualizeHistogram &) = delete;
-    /** Default destructor */
-    ~CLEqualizeHistogram();
-    /** Initialise the kernel's inputs.
-     *
-     * @param[in]  input  Input image. Data types supported: U8.
-     * @param[out] output Output of same data type with equalized brightness and contrast.
-     */
-    void configure(const ICLImage *input, ICLImage *output);
-    /** Initialise the kernel's inputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input image. Data types supported: U8.
-     * @param[out] output          Output of same data type with equalized brightness and contrast.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    std::unique_ptr<CLHistogramKernel>       _histogram_kernel;        /**< Kernel that calculates the histogram of input. */
-    std::unique_ptr<CLHistogramBorderKernel> _border_histogram_kernel; /**< Kernel that calculates the histogram on the borders. */
-    std::unique_ptr<CLTableLookupKernel>     _map_histogram_kernel;    /**< Kernel that maps the input to output using the lut. */
-    CLDistribution1D                         _hist;                    /**< Distribution that holds the histogram of the input image. */
-    CLDistribution1D                         _cum_dist;                /**< Distribution that holds the cummulative distribution of the input histogram. */
-    CLLut                                    _cd_lut;                  /**< Holds the equalization lookuptable. */
-    static const uint32_t                    max_range = 256;          /**< Histogram range of the internal histograms. */
-    static const uint32_t                    nr_bins   = 256;          /**< Histogram bins of the internal histograms. */
-};
-}
-#endif /*ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H */

diff --git a/arm_compute/runtime/CL/functions/CLErode.h b/arm_compute/runtime/CL/functions/CLErode.h
deleted file mode 100644
index bd66ed9..0000000
--- a/arm_compute/runtime/CL/functions/CLErode.h
+++ /dev/null

@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLERODE_H
-#define ARM_COMPUTE_CLERODE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute erode. This function calls the following OpenCL kernels:
-*
-* -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
-* -# @ref CLErodeKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLErode : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs, output and border mode
-     *
-     * @param[in,out] input                 First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Output tensor. Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the kernel's inputs, output and border mode
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Output tensor. Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLERODE_H */

diff --git a/arm_compute/runtime/CL/functions/CLFastCorners.h b/arm_compute/runtime/CL/functions/CLFastCorners.h
deleted file mode 100644
index 608fdf8..0000000
--- a/arm_compute/runtime/CL/functions/CLFastCorners.h
+++ /dev/null

@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFASTCORNERS_H
-#define ARM_COMPUTE_CLFASTCORNERS_H
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class CLFastCornersKernel;
-class CLCopyToArrayKernel;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to execute fast corners. This function calls the following CL kernels:
- *
- * -# @ref CLFastCornersKernel
- * -# @ref CLNonMaximaSuppression3x3Kernel (executed if nonmax_suppression == true)
- * -# @ref CLCopyToArrayKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLFastCorners : public IFunction
-{
-public:
-    /** Constructor */
-    CLFastCorners(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFastCorners(const CLFastCorners &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    const CLFastCorners &operator=(const CLFastCorners &) = delete;
-    /** Default destructor */
-    ~CLFastCorners();
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in]     input                 Source image. Data types supported: U8.
-     * @param[in]     threshold             Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
-     * @param[in]     nonmax_suppression    If true, non-maximum suppression is applied to detected corners before being placed in the array.
-     * @param[out]    corners               Array of keypoints to store the results.
-     * @param[in,out] num_corners           Record number of corners in the array
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, unsigned int *num_corners,
-                   BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in]     input                 Source image. Data types supported: U8.
-     * @param[in]     threshold             Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
-     * @param[in]     nonmax_suppression    If true, non-maximum suppression is applied to detected corners before being placed in the array.
-     * @param[out]    corners               Array of keypoints to store the results.
-     * @param[in,out] num_corners           Record number of corners in the array
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, unsigned int *num_corners,
-                   BorderMode border_mode, uint8_t constant_border_value = 0);
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    MemoryGroup                          _memory_group;
-    std::unique_ptr<CLFastCornersKernel> _fast_corners_kernel;
-    CLNonMaximaSuppression3x3            _suppr_func;
-    std::unique_ptr<CLCopyToArrayKernel> _copy_array_kernel;
-    CLImage                              _output;
-    CLImage                              _suppr;
-    Window                               _win;
-    bool                                 _non_max;
-    unsigned int                        *_num_corners;
-    cl::Buffer                           _num_buffer;
-    ICLKeyPointArray                    *_corners;
-    uint8_t                              _constant_border_value;
-};
-}
-#endif /*ARM_COMPUTE_CLFASTCORNERS_H */

diff --git a/arm_compute/runtime/CL/functions/CLGaussian3x3.h b/arm_compute/runtime/CL/functions/CLGaussian3x3.h
deleted file mode 100644
index 20ce2b4..0000000
--- a/arm_compute/runtime/CL/functions/CLGaussian3x3.h
+++ /dev/null

@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN3X3_H
-#define ARM_COMPUTE_CLGAUSSIAN3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute gaussian filter 3x3. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLGaussian3x3Kernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLGaussian3x3 : public ICLSimpleFunction
-{
-public:
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLGAUSSIAN3X3_H */

diff --git a/arm_compute/runtime/CL/functions/CLGaussian5x5.h b/arm_compute/runtime/CL/functions/CLGaussian5x5.h
deleted file mode 100644
index d08cef2..0000000
--- a/arm_compute/runtime/CL/functions/CLGaussian5x5.h
+++ /dev/null

@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN5X5_H
-#define ARM_COMPUTE_CLGAUSSIAN5X5_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLFillBorderKernel;
-class CLGaussian5x5HorKernel;
-class CLGaussian5x5VertKernel;
-class ICLTensor;
-
-/** Basic function to execute gaussian filter 5x5. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLGaussian5x5HorKernel
- * -# @ref CLGaussian5x5VertKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLGaussian5x5 : public IFunction
-{
-public:
-    /** Default Constructor.
-     *
-     * @param[in] memory_manager (Optional) Memory manager.
-     */
-    CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied */
-    CLGaussian5x5(const CLGaussian5x5 &) = delete;
-    /** Default move constructor */
-    CLGaussian5x5(CLGaussian5x5 &&) = default;
-    /** Prevent instances of this class from being copied */
-    CLGaussian5x5 &operator=(const CLGaussian5x5 &) = delete;
-    /** Default move assignment operator */
-    CLGaussian5x5 &operator=(CLGaussian5x5 &&) = default;
-    /** Default destructor */
-    ~CLGaussian5x5();
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-
-    // Inherited methods overridden:
-    void run() override;
-
-protected:
-    MemoryGroup                              _memory_group;   /**< Function's memory group */
-    std::unique_ptr<CLGaussian5x5HorKernel>  _kernel_hor;     /**< Horizontal pass kernel */
-    std::unique_ptr<CLGaussian5x5VertKernel> _kernel_vert;    /**< Vertical pass kernel */
-    std::unique_ptr<CLFillBorderKernel>      _border_handler; /**< Kernel to handle image borders */
-    CLImage                                  _tmp;            /**< Temporary buffer */
-};
-}
-#endif /*ARM_COMPUTE_CLGAUSSIAN5X5_H */

diff --git a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
deleted file mode 100644
index 70be673..0000000
--- a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
+++ /dev/null

@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIANPYRAMID_H
-#define ARM_COMPUTE_CLGAUSSIANPYRAMID_H
-
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLFillBorderKernel;
-class ICLTensor;
-class CLGaussianPyramidHorKernel;
-class CLGaussianPyramidVertKernel;
-class CLScaleKernel;
-
-/** Common interface for all Gaussian pyramid functions
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLGaussianPyramid : public IFunction
-{
-public:
-    /** Constructor */
-    CLGaussianPyramid();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramid(const CLGaussianPyramid &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramid &operator=(const CLGaussianPyramid &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramid(CLGaussianPyramid &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramid &operator=(CLGaussianPyramid &&) = default;
-    /** Default destructor */
-    ~CLGaussianPyramid();
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in, out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     pyramid               Destination pyramid tensors, Data types supported at each level: U8.
-     * @param[in]      border_mode           Border mode to use.
-     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     *
-     */
-    virtual void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value = 0) = 0;
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in]      compile_context       The compile context to be used.
-     * @param[in, out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     pyramid               Destination pyramid tensors, Data types supported at each level: U8.
-     * @param[in]      border_mode           Border mode to use.
-     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     *
-     */
-    virtual void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value = 0) = 0;
-
-protected:
-    ICLTensor *_input;
-    CLPyramid *_pyramid;
-    CLPyramid  _tmp;
-};
-
-/** Basic function to execute gaussian pyramid with HALF scale factor. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLGaussianPyramidHorKernel
- * -# @ref CLGaussianPyramidVertKernel
- */
-class CLGaussianPyramidHalf : public CLGaussianPyramid
-{
-public:
-    /** Constructor */
-    CLGaussianPyramidHalf();
-    /** Prevent instances of this class from being copied */
-    CLGaussianPyramidHalf(const CLGaussianPyramidHalf &) = delete;
-    /** Prevent instances of this class from being copied */
-    CLGaussianPyramidHalf &operator=(const CLGaussianPyramidHalf &) = delete;
-    /** Default destructor */
-    ~CLGaussianPyramidHalf();
-
-    // Inherited methods overridden:
-    void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
-    void run() override;
-
-private:
-    std::vector<std::unique_ptr<CLFillBorderKernel>>          _horizontal_border_handler;
-    std::vector<std::unique_ptr<CLFillBorderKernel>>          _vertical_border_handler;
-    std::vector<std::unique_ptr<CLGaussianPyramidHorKernel>>  _horizontal_reduction;
-    std::vector<std::unique_ptr<CLGaussianPyramidVertKernel>> _vertical_reduction;
-};
-
-/** Basic function to execute gaussian pyramid with ORB scale factor. This function calls the following OpenCL kernels and functions:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLGaussian5x5
- * -# @ref CLScaleKernel
- */
-class CLGaussianPyramidOrb : public CLGaussianPyramid
-{
-public:
-    /** Constructor */
-    CLGaussianPyramidOrb();
-
-    // Inherited methods overridden:
-    void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
-    void run() override;
-
-private:
-    std::vector<CLGaussian5x5>                  _gauss5x5;
-    std::vector<std::unique_ptr<CLScaleKernel>> _scale_nearest;
-};
-}
-#endif /*ARM_COMPUTE_CLGAUSSIANPYRAMID_H */

diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
deleted file mode 100644
index 87bcd7f..0000000
--- a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
+++ /dev/null

@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDESCRIPTOR_H
-#define ARM_COMPUTE_CLHOGDESCRIPTOR_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class IHOG;
-class CLHOGOrientationBinningKernel;
-class CLHOGBlockNormalizationKernel;
-/** Basic function to calculate HOG descriptor. This function calls the following OpenCL kernels:
- *
- * -# @ref CLHOGGradient
- * -# @ref CLHOGOrientationBinningKernel
- * -# @ref CLHOGBlockNormalizationKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLHOGDescriptor : public IFunction
-{
-public:
-    /** Default constructor */
-    CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied */
-    CLHOGDescriptor(const CLHOGDescriptor &) = delete;
-    /** Prevent instances of this class from being copied */
-    CLHOGDescriptor &operator=(const CLHOGDescriptor &) = delete;
-    /** Default destructor */
-    ~CLHOGDescriptor();
-    /** Initialise the function's source, destination, HOG data-object and border mode
-     *
-     * @param[in, out] input                 Input tensor. Data type supported: U8
-     *                                       (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     output                Output tensor which stores the HOG descriptor. DataType supported: F32. The number of channels is equal to the number of histogram bins per block
-     * @param[in]      hog                   HOG data object which describes the HOG descriptor
-     * @param[in]      border_mode           Border mode to use.
-     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destination, HOG data-object and border mode
-     *
-     * @param[in]      compile_context       The compile context to be used.
-     * @param[in, out] input                 Input tensor. Data type supported: U8
-     *                                       (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     output                Output tensor which stores the HOG descriptor. DataType supported: F32. The number of channels is equal to the number of histogram bins per block
-     * @param[in]      hog                   HOG data object which describes the HOG descriptor
-     * @param[in]      border_mode           Border mode to use.
-     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0);
-
-    // Inherited method overridden:
-    void run() override;
-
-private:
-    MemoryGroup                                    _memory_group;
-    CLHOGGradient                                  _gradient;
-    std::unique_ptr<CLHOGOrientationBinningKernel> _orient_bin;
-    std::unique_ptr<CLHOGBlockNormalizationKernel> _block_norm;
-    CLTensor                                       _mag;
-    CLTensor                                       _phase;
-    CLTensor                                       _hog_space;
-};
-}
-
-#endif /* ARM_COMPUTE_CLHOGDESCRIPTOR_H */

diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h
deleted file mode 100644
index 539a521..0000000
--- a/arm_compute/runtime/CL/functions/CLHOGDetector.h
+++ /dev/null

@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDETECTOR_H
-#define ARM_COMPUTE_CLHOGDETECTOR_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLHOGDetectorKernel;
-class ICLTensor;
-class ICLHOG;
-
-/** Basic function to execute HOG detector based on linear SVM. This function calls the following OpenCL kernel:
- *
- * -# @ref CLHOGDetectorKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLHOGDetector : public IFunction
-{
-public:
-    /** Default constructor */
-    CLHOGDetector();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGDetector(const CLHOGDetector &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGDetector &operator=(const CLHOGDetector &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHOGDetector(CLHOGDetector &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHOGDetector &operator=(CLHOGDetector &&) = default;
-    /** Default destructor */
-    ~CLHOGDetector();
-    /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
-     *
-     * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is caller's responsibility to clear it.
-     *
-     * @param[in]  input                   Input tensor. It is the output of @ref CLHOGDescriptor. Data type supported: F32
-     * @param[in]  hog                     HOG data-object that describes the HOG descriptor
-     * @param[out] detection_windows       Array of @ref DetectionWindow used to store the detected objects
-     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
-     *                                     It must be multiple of the block stride stored in hog
-     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
-     */
-    void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, size_t idx_class = 0);
-    /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
-     *
-     * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is caller's responsibility to clear it.
-     *
-     * @param[in]  compile_context         The compile context to be used.
-     * @param[in]  input                   Input tensor. It is the output of @ref CLHOGDescriptor. Data type supported: F32
-     * @param[in]  hog                     HOG data-object that describes the HOG descriptor
-     * @param[out] detection_windows       Array of @ref DetectionWindow used to store the detected objects
-     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
-     *                                     It must be multiple of the block stride stored in hog
-     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride,
-                   float  threshold = 0.0f,
-                   size_t idx_class = 0);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    std::unique_ptr<CLHOGDetectorKernel> _hog_detector_kernel;
-    ICLDetectionWindowArray             *_detection_windows;
-    cl::Buffer                           _num_detection_windows;
-};
-}
-
-#endif /* ARM_COMPUTE_CLHOGDETECTOR_H */

diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h
deleted file mode 100644
index 569490f..0000000
--- a/arm_compute/runtime/CL/functions/CLHOGGradient.h
+++ /dev/null

@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGGRADIENT_H
-#define ARM_COMPUTE_CLHOGGRADIENT_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLDerivative.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLMagnitudePhaseKernel;
-class ITensorInfo;
-/** Basic function to calculate the gradient for HOG. This function calls the following OpenCL kernels:
- *
- * -# @ref CLDerivative
- * -# @ref CLMagnitudePhaseKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLHOGGradient : public IFunction
-{
-public:
-    /** Default constructor */
-    CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Initialise the function's source, destinations, phase type and border mode
-     *
-     * @param[in, out] input                 Input tensor. Data type supported: U8.
-     *                                       (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     output_magnitude      Output tensor (magnitude). Data type supported: U16.
-     * @param[out]     output_phase          Output tensor.(phase). Format supported: U8
-     * @param[in]      phase_type            Type of @ref PhaseType
-     * @param[in]      border_mode           Border mode to use
-     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destinations, phase type and border mode
-     *
-     * @param[in]      compile_context       The compile context to be used.
-     * @param[in, out] input                 Input tensor. Data type supported: U8.
-     *                                       (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     output_magnitude      Output tensor (magnitude). Data type supported: U16.
-     * @param[out]     output_phase          Output tensor.(phase). Format supported: U8
-     * @param[in]      phase_type            Type of @ref PhaseType
-     * @param[in]      border_mode           Border mode to use
-     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode,
-                   uint8_t constant_border_value = 0);
-
-    // Inherited method overridden:
-    void run() override;
-
-private:
-    MemoryGroup                             _memory_group;
-    CLDerivative                            _derivative;
-    std::unique_ptr<CLMagnitudePhaseKernel> _mag_phase;
-    CLTensor                                _gx;
-    CLTensor                                _gy;
-};
-}
-#endif /*ARM_COMPUTE_CLHOGGRADIENT_H */

diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
deleted file mode 100644
index b9a5165..0000000
--- a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
+++ /dev/null

@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGMULTIDETECTION_H
-#define ARM_COMPUTE_CLHOGMULTIDETECTION_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLMultiHOG.h"
-#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
-#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLHOGOrientationBinningKernel;
-class CLHOGBlockNormalizationKernel;
-/** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. This function calls the following kernels:
- *
- * -# @ref CLHOGGradient
- * -# @ref CLHOGOrientationBinningKernel
- * -# @ref CLHOGBlockNormalizationKernel
- * -# @ref CLHOGDetector
- * -# @ref CPPDetectionWindowNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
- *
- * @note This implementation works if all the HOG data-objects within the IMultiHOG container have the same:
- *       -# Phase type
-         -# Normalization type
-         -# L2 hysteresis threshold if the normalization type is L2HYS_NORM
- *
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLHOGMultiDetection : public IFunction
-{
-public:
-    /** Default constructor */
-    CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGMultiDetection(const CLHOGMultiDetection &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGMultiDetection &operator=(const CLHOGMultiDetection &) = delete;
-    /** Default destructor */
-    ~CLHOGMultiDetection();
-    /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
-     *
-     * @param[in, out] input                    Input tensor. Data type supported: U8
-     *                                          (Written to only for @p border_mode != UNDEFINED)
-     * @param[in]      multi_hog                Container of multiple HOG data object. Each HOG data object describes one HOG model to detect.
-     *                                          This container should store the HOG data-objects in descending or ascending cell_size width order.
-     *                                          This will help to understand if the HOG descriptor computation can be skipped for some HOG data-objects
-     * @param[out]     detection_windows        Array of @ref DetectionWindow used for locating the detected objects
-     * @param[in]      detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object
-     *                                          The dimension of this array must be the same of multi_hog->num_models()
-     *                                          The i-th detection_window_stride of this array must be multiple of the block_stride stored in the i-th multi_hog array
-     * @param[in]      border_mode              Border mode to use.
-     * @param[in]      constant_border_value    (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     * @param[in]      threshold                (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]      non_maxima_suppression   (Optional) Flag to specify whether the non-maxima suppression is required or not.
-     *                                          True if the non-maxima suppression stage has to be computed
-     * @param[in]      min_distance             (Optional) Radial Euclidean distance to use for the non-maxima suppression stage
-     *
-     */
-    void configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
-                   uint8_t constant_border_value = 0, float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
-    /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
-     *
-     * @param[in]      compile_context          The compile context to be used.
-     * @param[in, out] input                    Input tensor. Data type supported: U8
-     *                                          (Written to only for @p border_mode != UNDEFINED)
-     * @param[in]      multi_hog                Container of multiple HOG data object. Each HOG data object describes one HOG model to detect.
-     *                                          This container should store the HOG data-objects in descending or ascending cell_size width order.
-     *                                          This will help to understand if the HOG descriptor computation can be skipped for some HOG data-objects
-     * @param[out]     detection_windows        Array of @ref DetectionWindow used for locating the detected objects
-     * @param[in]      detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object
-     *                                          The dimension of this array must be the same of multi_hog->num_models()
-     *                                          The i-th detection_window_stride of this array must be multiple of the block_stride stored in the i-th multi_hog array
-     * @param[in]      border_mode              Border mode to use.
-     * @param[in]      constant_border_value    (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     * @param[in]      threshold                (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]      non_maxima_suppression   (Optional) Flag to specify whether the non-maxima suppression is required or not.
-     *                                          True if the non-maxima suppression stage has to be computed
-     * @param[in]      min_distance             (Optional) Radial Euclidean distance to use for the non-maxima suppression stage
-     *
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides,
-                   BorderMode border_mode, uint8_t constant_border_value = 0, float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
-
-    // Inherited method overridden:
-    void run() override;
-
-private:
-    MemoryGroup                                                 _memory_group;
-    CLHOGGradient                                               _gradient_kernel;
-    std::vector<std::unique_ptr<CLHOGOrientationBinningKernel>> _orient_bin_kernel;
-    std::vector<std::unique_ptr<CLHOGBlockNormalizationKernel>> _block_norm_kernel;
-    std::vector<CLHOGDetector>                                  _hog_detect_kernel;
-    CPPDetectionWindowNonMaximaSuppressionKernel                _non_maxima_kernel;
-    std::vector<CLTensor>                                       _hog_space;
-    std::vector<CLTensor>                                       _hog_norm_space;
-    ICLDetectionWindowArray                                    *_detection_windows;
-    CLTensor                                                    _mag;
-    CLTensor                                                    _phase;
-    bool                                                        _non_maxima_suppression;
-    size_t                                                      _num_orient_bin_kernel;
-    size_t                                                      _num_block_norm_kernel;
-    size_t                                                      _num_hog_detect_kernel;
-};
-}
-
-#endif /* ARM_COMPUTE_CLHOGMULTIDETECTION_H */

diff --git a/arm_compute/runtime/CL/functions/CLHarrisCorners.h b/arm_compute/runtime/CL/functions/CLHarrisCorners.h
deleted file mode 100644
index 7f4a456..0000000
--- a/arm_compute/runtime/CL/functions/CLHarrisCorners.h
+++ /dev/null

@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHARRISCORNERS_H
-#define ARM_COMPUTE_CLHARRISCORNERS_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLHarrisScoreKernel;
-class CLFillBorderKernel;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to execute harris corners detection. This function calls the following CL and Neon kernels and functions:
- *
- * @note Requires CPU support for the kernels: CPPCornerCandidatesKernel and CPPSortEuclideanDistanceKernel.
- *
- * -# @ref CLSobel3x3 (if gradient_size == 3) or<br/>
- *    @ref CLSobel5x5 (if gradient_size == 5) or<br/>
- *    @ref CLSobel7x7 (if gradient_size == 7)
- * -# @ref CLFillBorderKernel
- * -# @ref CLHarrisScoreKernel
- * -# @ref CLNonMaximaSuppression3x3
- * -# @ref CPPCornerCandidatesKernel
- * -# @ref CPPSortEuclideanDistanceKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLHarrisCorners : public IFunction
-{
-public:
-    /** Constructor */
-    CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHarrisCorners(const CLHarrisCorners &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    const CLHarrisCorners &operator=(const CLHarrisCorners &) = delete;
-    /** Default destructor */
-    ~CLHarrisCorners();
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in,out] input                 Source image. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[in]     threshold             Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
-     * @param[in]     min_dist              Radial Euclidean distance for the euclidean distance stage.
-     * @param[in]     sensitivity           Sensitivity threshold k from the Harris-Stephens equation
-     * @param[in]     gradient_size         The gradient window size to use on the input. The implementation supports 3, 5, and 7
-     * @param[in]     block_size            The block window size used to compute the Harris Corner score. The implementation supports 3, 5, and 7.
-     * @param[out]    corners               Array of keypoints to store the results.
-     * @param[in]     border_mode           Border mode to use
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     * @param[in]     use_fp16              (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
-     */
-    void configure(ICLImage *input, float threshold, float min_dist, float sensitivity,
-                   int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
-                   BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false);
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source image. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[in]     threshold             Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
-     * @param[in]     min_dist              Radial Euclidean distance for the euclidean distance stage.
-     * @param[in]     sensitivity           Sensitivity threshold k from the Harris-Stephens equation
-     * @param[in]     gradient_size         The gradient window size to use on the input. The implementation supports 3, 5, and 7
-     * @param[in]     block_size            The block window size used to compute the Harris Corner score. The implementation supports 3, 5, and 7.
-     * @param[out]    corners               Array of keypoints to store the results.
-     * @param[in]     border_mode           Border mode to use
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     * @param[in]     use_fp16              (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
-     */
-    void configure(const CLCompileContext &compile_context, ICLImage *input, float threshold, float min_dist, float sensitivity,
-                   int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
-                   BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    MemoryGroup                          _memory_group;          /**< Function's memory group */
-    std::unique_ptr<IFunction>           _sobel;                 /**< Sobel function */
-    std::unique_ptr<CLHarrisScoreKernel> _harris_score;          /**< Harris score kernel */
-    CLNonMaximaSuppression3x3            _non_max_suppr;         /**< Non-maxima suppression function */
-    CPPCornerCandidatesKernel            _candidates;            /**< Sort kernel */
-    CPPSortEuclideanDistanceKernel       _sort_euclidean;        /**< Euclidean distance kernel */
-    std::unique_ptr<CLFillBorderKernel>  _border_gx;             /**< Border handler before running harris score */
-    std::unique_ptr<CLFillBorderKernel>  _border_gy;             /**< Border handler before running harris score */
-    CLImage                              _gx;                    /**< Source image - Gx component */
-    CLImage                              _gy;                    /**< Source image - Gy component */
-    CLImage                              _score;                 /**< Source image - Harris score */
-    CLImage                              _nonmax;                /**< Source image - Non-Maxima suppressed image */
-    std::vector<InternalKeypoint>        _corners_list;          /**< Array of InternalKeypoint. It stores the potential corner candidates */
-    int32_t                              _num_corner_candidates; /**< Number of potential corner candidates */
-    ICLKeyPointArray                    *_corners;               /**< Output corners array */
-};
-}
-#endif /*ARM_COMPUTE_CLHARRISCORNERS_H */

diff --git a/arm_compute/runtime/CL/functions/CLHistogram.h b/arm_compute/runtime/CL/functions/CLHistogram.h
deleted file mode 100644
index b45a79e..0000000
--- a/arm_compute/runtime/CL/functions/CLHistogram.h
+++ /dev/null

@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHISTOGRAM_H
-#define ARM_COMPUTE_CLHISTOGRAM_H
-
-#include "arm_compute/runtime/IFunction.h"
-#include "src/core/CL/kernels/CLHistogramKernel.h"
-
-namespace arm_compute
-{
-class ICLDistribution1D;
-class ICLTensor;
-
-/** Basic function to execute histogram. This function calls the following OpenCL kernels:
- *
- *  -# @ref CLHistogramKernel
- *  -# @ref CLHistogramBorderKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLHistogram : public IFunction
-{
-public:
-    /*
-     * @ Default constructor
-     */
-    CLHistogram();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogram(const CLHistogram &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    const CLHistogram &operator=(const CLHistogram &) = delete;
-    /** Initialize the function
-     *
-     * @param[in]  input  Source image. Data types supported: U8
-     * @param[out] output Output distribution.
-     */
-    void configure(const ICLImage *input, ICLDistribution1D *output);
-    /** Initialize the function
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source image. Data types supported: U8
-     * @param[out] output          Output distribution.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    CLHistogramKernel       _kernel;        /**< kernel to run */
-    CLHistogramBorderKernel _kernel_border; /**< Border kernel to run */
-};
-}
-#endif /*ARM_COMPUTE_CLHISTOGRAM_H */

diff --git a/arm_compute/runtime/CL/functions/CLIntegralImage.h b/arm_compute/runtime/CL/functions/CLIntegralImage.h
deleted file mode 100644
index b6c98dc..0000000
--- a/arm_compute/runtime/CL/functions/CLIntegralImage.h
+++ /dev/null

@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLINTEGRALIMAGE_H
-#define ARM_COMPUTE_CLINTEGRALIMAGE_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLIntegralImageHorKernel;
-class CLIntegralImageVertKernel;
-class ICLTensor;
-
-/** Basic function to execute integral image. This function calls the following OpenCL kernels:
- *
- * -# @ref CLIntegralImageHorKernel
- * -# @ref CLIntegralImageVertKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLIntegralImage : public IFunction
-{
-public:
-    /** Default Constructor. */
-    CLIntegralImage();
-    /** Prevent instances of this class from being copied */
-    CLIntegralImage(const CLIntegralImage &) = delete;
-    /** Prevent instances of this class from being copied */
-    CLIntegralImage &operator=(const CLIntegralImage &) = delete;
-    /** Default destructor */
-    ~CLIntegralImage();
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor, Data types supported: U32.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor, Data types supported: U32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run() override;
-
-protected:
-    std::unique_ptr<CLIntegralImageHorKernel>  _integral_hor;  /**< Integral Image Horizontal kernel */
-    std::unique_ptr<CLIntegralImageVertKernel> _integral_vert; /**< Integral Image Vertical kernel */
-};
-}
-#endif /*ARM_COMPUTE_CLINTEGRALIMAGE_H */

diff --git a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
deleted file mode 100644
index 875b714..0000000
--- a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
+++ /dev/null

@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLAPLACIANPYRAMID_H
-#define ARM_COMPUTE_CLLAPLACIANPYRAMID_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
-#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute laplacian pyramid. This function calls the following OpenCL kernels and functions:
- *
- * -# @ref CLGaussianPyramidHalf
- * -# @ref CLGaussian5x5
- * -# @ref CLArithmeticSubtraction
- *
- *  First a Gaussian pyramid is created. Then, for each level i, the corresponding tensor I(i) is blurred with the Gaussian 5x5 filter, and then
- *  difference between the two tensors is the corresponding level L(i) of the Laplacian pyramid.
- *  L(i) = I(i) - Gaussian5x5(I(i))
- *  Level 0 has always the same first two dimensions as the input tensor.
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLLaplacianPyramid : public IFunction
-{
-public:
-    /** Constructor */
-    CLLaplacianPyramid();
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in]  input                 Source tensor. Data types supported: U8.
-     * @param[out] pyramid               Destination pyramid tensors, Data types supported at each level: S16.
-     * @param[out] output                The lowest resolution tensor necessary to reconstruct the input tensor from the pyramid. Data types supported: S16.
-     *                                   The first two dimensions of this tensor must match the first two dimensions of the tensor in the last level of the pyramid, that is:
-     *                                   output.width = input.width() / pow(2,pyramid_levels-1) and out.height = in.height() / pow(2,pyramid_levels-1)
-     * @param[in]  border_mode           Border mode to use.
-     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     *
-     */
-    void configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in]  compile_context       The compile context to be used.
-     * @param[in]  input                 Source tensor. Data types supported: U8.
-     * @param[out] pyramid               Destination pyramid tensors, Data types supported at each level: S16.
-     * @param[out] output                The lowest resolution tensor necessary to reconstruct the input tensor from the pyramid. Data types supported: S16.
-     *                                   The first two dimensions of this tensor must match the first two dimensions of the tensor in the last level of the pyramid, that is:
-     *                                   output.width = input.width() / pow(2,pyramid_levels-1) and out.height = in.height() / pow(2,pyramid_levels-1)
-     * @param[in]  border_mode           Border mode to use.
-     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     *
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    size_t                               _num_levels;
-    CLGaussianPyramidHalf                _gaussian_pyr_function;
-    std::vector<CLGaussian5x5>           _convf;
-    std::vector<CLArithmeticSubtraction> _subf;
-    CLDepthConvertLayer                  _depth_function;
-    CLPyramid                            _gauss_pyr;
-    CLPyramid                            _conv_pyr;
-};
-}
-#endif /*ARM_COMPUTE_CLLAPLACIANPYRAMID_H */

diff --git a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
deleted file mode 100644
index c780b56..0000000
--- a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
+++ /dev/null

@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H
-#define ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
-#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
-#include "arm_compute/runtime/CL/functions/CLScale.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to execute laplacian reconstruction. This function calls the following OpenCL kernels and functions:
- *
- * -# @ref CLArithmeticAddition
- * -# @ref CLScale
- * -# @ref CLDepthConvertLayer
- *
- * This function reconstructs the original image from a Laplacian Image Pyramid.
- *
- *  The input image is added to the last level of the Laplacian pyramid L(n-2), the resulting image is upsampled to the
- *  resolution of the next pyramid level.
- *
- *  I(n-2) = upsample( input + L(n-1)
- *
- *  For each pyramid level i, except i=0 and i=n-1:
- *  I(i-1) = upsample(I(i) + L(i))
- *
- *  output = I(0) + L(0)
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLLaplacianReconstruct : public IFunction
-{
-public:
-    /** Constructor */
-    CLLaplacianReconstruct();
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * The Output image must have the same size as the first level of the pyramid.
-     * The Input image must have the same size as the last level of the pyramid.
-     *
-     * The idea is to reconstuct the original hi-res image from a low-res representation of it and the laplacian pyramid.
-     *
-     * @param[in]  pyramid               Laplacian pyramid tensors, Data types supported at each level: S16.
-     * @param[in]  input                 Source tensor. Data types supported: S16.
-     * @param[out] output                Output tensor. Data types supported: U8.
-     * @param[in]  border_mode           Border mode to use for the convolution.
-     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     *
-     */
-    void configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * The Output image must have the same size as the first level of the pyramid.
-     * The Input image must have the same size as the last level of the pyramid.
-     *
-     * The idea is to reconstuct the original hi-res image from a low-res representation of it and the laplacian pyramid.
-     *
-     * @param[in]  compile_context       The compile context to be used.
-     * @param[in]  pyramid               Laplacian pyramid tensors, Data types supported at each level: S16.
-     * @param[in]  input                 Source tensor. Data types supported: S16.
-     * @param[out] output                Output tensor. Data types supported: U8.
-     * @param[in]  border_mode           Border mode to use for the convolution.
-     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    CLPyramid                         _tmp_pyr;
-    std::vector<CLArithmeticAddition> _addf;
-    std::vector<CLScale>              _scalef;
-    CLDepthConvertLayer               _depthf;
-};
-}
-#endif /*ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H */

diff --git a/arm_compute/runtime/CL/functions/CLMagnitude.h b/arm_compute/runtime/CL/functions/CLMagnitude.h
deleted file mode 100644
index 4ed1414..0000000
--- a/arm_compute/runtime/CL/functions/CLMagnitude.h
+++ /dev/null

@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMAGNITUDE_H
-#define ARM_COMPUTE_CLMAGNITUDE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to run @ref CLMagnitudePhaseKernel.
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLMagnitude : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs.
-     *
-     * @param[in]  input1   First tensor input. Data types supported: S16.
-     * @param[in]  input2   Second tensor input. Data types supported: S16.
-     * @param[out] output   Output tensor. Data types supported: S16.
-     * @param[in]  mag_type (Optional) Magnitude calculation type. Default: L2NORM.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM);
-    /** Initialise the kernel's inputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          First tensor input. Data types supported: S16.
-     * @param[in]  input2          Second tensor input. Data types supported: S16.
-     * @param[out] output          Output tensor. Data types supported: S16.
-     * @param[in]  mag_type        (Optional) Magnitude calculation type. Default: L2NORM.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM);
-};
-}
-#endif /*ARM_COMPUTE_CLMAGNITUDE_H */

diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDev.h b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
deleted file mode 100644
index d9ced13..0000000
--- a/arm_compute/runtime/CL/functions/CLMeanStdDev.h
+++ /dev/null

@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEANSTDDEV_H
-#define ARM_COMPUTE_CLMEANSTDDEV_H
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-class ITensorInfo;
-class CLFillBorderKernel;
-class CLMeanStdDevKernel;
-/** Basic function to execute mean and standard deviation by calling @ref CLMeanStdDevKernel */
-class CLMeanStdDev : public IFunction
-{
-public:
-    /** Default Constructor. */
-    CLMeanStdDev(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMeanStdDev(const CLMeanStdDev &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMeanStdDev &operator=(const CLMeanStdDev &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMeanStdDev(CLMeanStdDev &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMeanStdDev &operator=(CLMeanStdDev &&) = default;
-    /** Default destructor */
-    ~CLMeanStdDev();
-    /** Initialise the kernel's inputs and outputs.
-     *
-     * @param[in, out] input  Input image. Data types supported: U8/F16/F32. (Written to only for border filling)
-     * @param[out]     mean   Output average pixel value.
-     * @param[out]     stddev (Optional) Output standard deviation of pixel values.
-     */
-    void configure(ICLImage *input, float *mean, float *stddev = nullptr);
-    /** Initialise the kernel's inputs and outputs.
-     *
-     * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] input           Input image. Data types supported: U8/F16/F32. (Written to only for border filling)
-     * @param[out]     mean            Output average pixel value.
-     * @param[out]     stddev          (Optional) Output standard deviation of pixel values.
-     */
-    void configure(const CLCompileContext &compile_context, ICLImage *input, float *mean, float *stddev = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDev
-     *
-     * @param[in] input  Input image. Data types supported: U8/F16/F32.
-     * @param[in] mean   Output average pixel value.
-     * @param[in] stddev (Optional) Output standard deviation of pixel values.
-     *
-     * @return a status
-     */
-    static Status validate(ITensorInfo *input, float *mean, float *stddev = nullptr);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    template <typename T>
-    void run_float();
-    void run_int();
-
-    MemoryGroup                         _memory_group;               /**< Function's memory group */
-    DataType                            _data_type;                  /**< Input data type. */
-    unsigned int                        _num_pixels;                 /**< Number of image's pixels. */
-    bool                                _run_stddev;                 /**< Flag for knowing if we should run stddev reduction function. */
-    CLReductionOperation                _reduction_operation_mean;   /**< Reduction operation function for computing mean value. */
-    CLReductionOperation                _reduction_operation_stddev; /**< Reduction operation function for computing standard deviation. */
-    CLTensor                            _reduction_output_mean;      /**< Reduction operation output tensor for mean value. */
-    CLTensor                            _reduction_output_stddev;    /**< Reduction operation output tensor for standard deviation value. */
-    float                              *_mean;                       /**< Pointer that holds the mean value. */
-    float                              *_stddev;                     /**< Pointer that holds the standard deviation value. */
-    std::unique_ptr<CLMeanStdDevKernel> _mean_stddev_kernel;         /**< Kernel that standard deviation calculation. */
-    std::unique_ptr<CLFillBorderKernel> _fill_border_kernel;         /**< Kernel that fills the border with zeroes. */
-    cl::Buffer                          _global_sum;                 /**< Variable that holds the global sum among calls in order to ease reduction */
-    cl::Buffer                          _global_sum_squared;         /**< Variable that holds the global sum of squared values among calls in order to ease reduction */
-};
-}
-#endif /*ARM_COMPUTE_CLMEANSTDDEV_H */

diff --git a/arm_compute/runtime/CL/functions/CLMedian3x3.h b/arm_compute/runtime/CL/functions/CLMedian3x3.h
deleted file mode 100644
index 1fe318e..0000000
--- a/arm_compute/runtime/CL/functions/CLMedian3x3.h
+++ /dev/null

@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEDIAN3X3_H
-#define ARM_COMPUTE_CLMEDIAN3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute median filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLMedian3x3Kernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLMedian3x3 : public ICLSimpleFunction
-{
-public:
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLMEDIAN3X3_H */

diff --git a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
deleted file mode 100644
index 77c381f..0000000
--- a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
+++ /dev/null

@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLOCATION_H
-#define ARM_COMPUTE_CLMINMAXLOCATION_H
-
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLMinMaxKernel;
-class CLMinMaxLocationKernel;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to execute min and max location. This function calls the following OpenCL kernels:
- *
- * -# @ref CLMinMaxKernel
- * -# @ref CLMinMaxLocationKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLMinMaxLocation : public IFunction
-{
-public:
-    /** Constructor */
-    CLMinMaxLocation();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLocation(const CLMinMaxLocation &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLocation &operator=(const CLMinMaxLocation &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLocation(CLMinMaxLocation &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLocation &operator=(CLMinMaxLocation &&) = default;
-    /** Default destructor */
-    ~CLMinMaxLocation();
-    /** Initialise the kernel's inputs and outputs.
-     *
-     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
-     *
-     * @param[in]  input     Input image. Data types supported: U8/S16/F32.
-     * @param[out] min       Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] max       Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] min_loc   (Optional) Array of Coordinates2D used to store minimum value locations.
-     * @param[out] max_loc   (Optional) Array of Coordinates2D used to store maximum value locations.
-     * @param[out] min_count (Optional) Number of minimum value encounters.
-     * @param[out] max_count (Optional) Number of maximum value encounters.
-     */
-    void configure(const ICLImage *input, void *min, void *max,
-                   CLCoordinates2DArray *min_loc = nullptr, CLCoordinates2DArray *max_loc = nullptr,
-                   uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
-    /** Initialise the kernel's inputs and outputs.
-     *
-     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input image. Data types supported: U8/S16/F32.
-     * @param[out] min             Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] max             Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] min_loc         (Optional) Array of Coordinates2D used to store minimum value locations.
-     * @param[out] max_loc         (Optional) Array of Coordinates2D used to store maximum value locations.
-     * @param[out] min_count       (Optional) Number of minimum value encounters.
-     * @param[out] max_count       (Optional) Number of maximum value encounters.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, void *min, void *max,
-                   CLCoordinates2DArray *min_loc = nullptr, CLCoordinates2DArray *max_loc = nullptr,
-                   uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    std::unique_ptr<CLMinMaxKernel>         _min_max_kernel;     /**< Kernel that performs min/max */
-    std::unique_ptr<CLMinMaxLocationKernel> _min_max_loc_kernel; /**< Kernel that counts min/max occurrences and identifies their positions */
-    cl::Buffer                              _min_max_vals;       /**< Buffer to collect min, max values */
-    cl::Buffer                              _min_max_count_vals; /**< Buffer to collect min, max values */
-    void                                   *_min;                /**< Minimum value. */
-    void                                   *_max;                /**< Maximum value. */
-    uint32_t                               *_min_count;          /**< Minimum value occurrences. */
-    uint32_t                               *_max_count;          /**< Maximum value occurrences. */
-    CLCoordinates2DArray                   *_min_loc;            /**< Minimum value occurrences coordinates. */
-    CLCoordinates2DArray                   *_max_loc;            /**< Maximum value occurrences  coordinates. */
-};
-}
-#endif /*ARM_COMPUTE_CLMINMAXLOCATION_H */

diff --git a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
deleted file mode 100644
index 3d0947d..0000000
--- a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
+++ /dev/null

@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONLINEARFILTER_H
-#define ARM_COMPUTE_CLNONLINEARFILTER_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute non linear filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLNonLinearFilterKernel
- *
- * @note Supported mask dimensions squares of sizes 3, 5
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLNonLinearFilter : public ICLSimpleFunction
-{
-public:
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor. Data types supported: U8
-     * @param[in]     function              Non linear function to perform
-     * @param[in]     mask_size             Mask size. Supported sizes: 3, 5
-     * @param[in]     pattern               Mask pattern
-     * @param[in]     mask                  The given mask. Will be used only if pattern is specified to PATTERN_OTHER
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                   BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor. Data types supported: U8
-     * @param[in]     function              Non linear function to perform
-     * @param[in]     mask_size             Mask size. Supported sizes: 3, 5
-     * @param[in]     pattern               Mask pattern
-     * @param[in]     mask                  The given mask. Will be used only if pattern is specified to PATTERN_OTHER
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                   BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLNONLINEARFILTER_H */

diff --git a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
deleted file mode 100644
index 60dad42..0000000
--- a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
+++ /dev/null

@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H
-#define ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute non-maxima suppression over a 3x3 window. This function calls the following CL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLNonMaximaSuppression3x3Kernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLNonMaximaSuppression3x3 : public ICLSimpleFunction
-{
-public:
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT
-     *       The constant values used with CONSTANT border mode is 0
-     *
-     * @param[in,out] input       Source tensor. Data types supported: U8, F32. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output      Destination for the Non-Maxima suppressions 3x3. Data types supported: same as @p input.
-     * @param[in]     border_mode Border mode to use for non-maxima suppression.
-     *                                   The implementation supports just 2 border modes: UNDEFINED and CONSTANT
-     */
-    void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT
-     *       The constant values used with CONSTANT border mode is 0
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] input           Source tensor. Data types supported: U8, F32. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output          Destination for the Non-Maxima suppressions 3x3. Data types supported: same as @p input.
-     * @param[in]     border_mode     Border mode to use for non-maxima suppression.
-     *                                   The implementation supports just 2 border modes: UNDEFINED and CONSTANT
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode);
-};
-}
-#endif /* ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H */

diff --git a/arm_compute/runtime/CL/functions/CLOpticalFlow.h b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
deleted file mode 100644
index 5c555f5..0000000
--- a/arm_compute/runtime/CL/functions/CLOpticalFlow.h
+++ /dev/null

@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLOPTICALFLOW_H
-#define ARM_COMPUTE_CLOPTICALFLOW_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLPyramid;
-class CLLKTrackerInitKernel;
-class CLLKTrackerStage0Kernel;
-class CLLKTrackerStage1Kernel;
-class CLLKTrackerFinalizeKernel;
-
-/** OpenCL Array of Internal Keypoints */
-using CLLKInternalKeypointArray = CLArray<CLLKInternalKeypoint>;
-/** OpenCL Array of Coefficient Tables */
-using CLCoefficientTableArray = CLArray<CLCoefficientTable>;
-/** OpenCL Array of Old Values */
-using CLOldValueArray = CLArray<CLOldValue>;
-
-/** Basic function to execute optical flow. This function calls the following OpenCL kernels and functions:
- *
- * -# @ref CLScharr3x3
- * -# @ref CLLKTrackerInitKernel
- * -# @ref CLLKTrackerStage0Kernel
- * -# @ref CLLKTrackerStage1Kernel
- * -# @ref CLLKTrackerFinalizeKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLOpticalFlow : public IFunction
-{
-public:
-    /** Default constructor */
-    CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLOpticalFlow(const CLOpticalFlow &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLOpticalFlow &operator=(const CLOpticalFlow &) = delete;
-    /** Allow instances of this class to be moved */
-    CLOpticalFlow(CLOpticalFlow &&) = default;
-    /** Allow instances of this class to be moved */
-    CLOpticalFlow &operator=(CLOpticalFlow &&) = default;
-    /** Default destructor */
-    ~CLOpticalFlow();
-    /**  Initialise the function input and output
-     *
-     * @param[in]  old_pyramid           Pointer to the pyramid for the old tensor. Data types supported U8
-     * @param[in]  new_pyramid           Pointer to the pyramid for the new tensor. Data types supported U8
-     * @param[in]  old_points            Pointer to the IKeyPointArray storing old key points
-     * @param[in]  new_points_estimates  Pointer to the IKeyPointArray storing new estimates key points
-     * @param[out] new_points            Pointer to the IKeyPointArray storing new key points
-     * @param[in]  termination           The criteria to terminate the search of each keypoint.
-     * @param[in]  epsilon               The error for terminating the algorithm
-     * @param[in]  num_iterations        The maximum number of iterations before terminate the alogrithm
-     * @param[in]  window_dimension      The size of the window on which to perform the algorithm
-     * @param[in]  use_initial_estimate  The flag to indicate whether the initial estimated position should be used
-     * @param[in]  border_mode           The border mode applied at scharr kernel stage
-     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT
-     *
-     */
-    void configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
-                   const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
-                   Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
-                   BorderMode border_mode, uint8_t constant_border_value = 0);
-    /**  Initialise the function input and output
-     *
-     * @param[in]  compile_context       The compile context to be used.
-     * @param[in]  old_pyramid           Pointer to the pyramid for the old tensor. Data types supported U8
-     * @param[in]  new_pyramid           Pointer to the pyramid for the new tensor. Data types supported U8
-     * @param[in]  old_points            Pointer to the IKeyPointArray storing old key points
-     * @param[in]  new_points_estimates  Pointer to the IKeyPointArray storing new estimates key points
-     * @param[out] new_points            Pointer to the IKeyPointArray storing new key points
-     * @param[in]  termination           The criteria to terminate the search of each keypoint.
-     * @param[in]  epsilon               The error for terminating the algorithm
-     * @param[in]  num_iterations        The maximum number of iterations before terminate the alogrithm
-     * @param[in]  window_dimension      The size of the window on which to perform the algorithm
-     * @param[in]  use_initial_estimate  The flag to indicate whether the initial estimated position should be used
-     * @param[in]  border_mode           The border mode applied at scharr kernel stage
-     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
-                   const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
-                   Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
-                   BorderMode border_mode, uint8_t constant_border_value = 0);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    MemoryGroup                                           _memory_group;
-    std::vector<std::unique_ptr<CLLKTrackerInitKernel>>   _tracker_init_kernel;
-    std::vector<std::unique_ptr<CLLKTrackerStage0Kernel>> _tracker_stage0_kernel;
-    std::vector<std::unique_ptr<CLLKTrackerStage1Kernel>> _tracker_stage1_kernel;
-    std::unique_ptr<CLLKTrackerFinalizeKernel>            _tracker_finalize_kernel;
-    std::vector<CLScharr3x3>                              _func_scharr;
-    std::vector<CLTensor>                                 _scharr_gx;
-    std::vector<CLTensor>                                 _scharr_gy;
-    const ICLKeyPointArray                               *_old_points;
-    const ICLKeyPointArray                               *_new_points_estimates;
-    ICLKeyPointArray                                     *_new_points;
-    std::unique_ptr<CLLKInternalKeypointArray>            _old_points_internal;
-    std::unique_ptr<CLLKInternalKeypointArray>            _new_points_internal;
-    std::unique_ptr<CLCoefficientTableArray>              _coefficient_table;
-    std::unique_ptr<CLOldValueArray>                      _old_values;
-    size_t                                                _num_levels;
-};
-}
-#endif /*ARM_COMPUTE_CLOPTICALFLOW_H */

diff --git a/arm_compute/runtime/CL/functions/CLPhase.h b/arm_compute/runtime/CL/functions/CLPhase.h
deleted file mode 100644
index 7c76c23..0000000
--- a/arm_compute/runtime/CL/functions/CLPhase.h
+++ /dev/null

@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPHASE_H
-#define ARM_COMPUTE_CLPHASE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute an @ref CLMagnitudePhaseKernel.
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLPhase : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs, output.
-     *
-     * @param[in]  input1     First tensor input. Data types supported: S16.
-     * @param[in]  input2     Second tensor input. Data types supported: S16.
-     * @param[out] output     Output tensor. Data types supported: U8.
-     * @param[in]  phase_type (Optional) Phase calculation type. Default: SIGNED.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type = PhaseType::SIGNED);
-    /** Initialise the kernel's inputs, output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          First tensor input. Data types supported: S16.
-     * @param[in]  input2          Second tensor input. Data types supported: S16.
-     * @param[out] output          Output tensor. Data types supported: U8.
-     * @param[in]  phase_type      (Optional) Phase calculation type. Default: SIGNED.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type = PhaseType::SIGNED);
-};
-}
-#endif /*ARM_COMPUTE_CLPHASE_H */

diff --git a/arm_compute/runtime/CL/functions/CLScharr3x3.h b/arm_compute/runtime/CL/functions/CLScharr3x3.h
deleted file mode 100644
index 4c747af..0000000
--- a/arm_compute/runtime/CL/functions/CLScharr3x3.h
+++ /dev/null

@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSCHARR3X3_H
-#define ARM_COMPUTE_CLSCHARR3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute scharr 3x3 filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLScharr3x3Kernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLScharr3x3 : public ICLSimpleFunction
-{
-public:
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note At least one of output_x or output_y must be not NULL.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output_x              (optional) Destination for the Scharr 3x3 convolution along the X axis. Data types supported: S16.
-     * @param[out]    output_y              (optional) Destination for the Scharr 3x3 convolution along the Y axis. Data types supported: S16.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note At least one of output_x or output_y must be not NULL.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output_x              (optional) Destination for the Scharr 3x3 convolution along the X axis. Data types supported: S16.
-     * @param[out]    output_y              (optional) Destination for the Scharr 3x3 convolution along the Y axis. Data types supported: S16.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLSCHARR3X3_H */

diff --git a/arm_compute/runtime/CL/functions/CLSobel3x3.h b/arm_compute/runtime/CL/functions/CLSobel3x3.h
deleted file mode 100644
index 1e57453..0000000
--- a/arm_compute/runtime/CL/functions/CLSobel3x3.h
+++ /dev/null

@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL3X3_H
-#define ARM_COMPUTE_CLSOBEL3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute sobel 3x3 filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLSobel3x3Kernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLSobel3x3 : public ICLSimpleFunction
-{
-public:
-    /** Default Constructor */
-    CLSobel3x3() = default;
-    /** Prevent instances of this class from being copied */
-    CLSobel3x3(const CLSobel3x3 &) = delete;
-    /** Prevent instances of this class from being copied */
-    CLSobel3x3 &operator=(const CLSobel3x3 &) = delete;
-    /** Default destructor */
-    ~CLSobel3x3();
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note At least one of output_x or output_y must be not NULL.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output_x              (optional) Destination for the Sobel 3x3 convolution along the X axis. Data types supported: S16.
-     * @param[out]    output_y              (optional) Destination for the Sobel 3x3 convolution along the Y axis. Data types supported: S16.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note At least one of output_x or output_y must be not NULL.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output_x              (optional) Destination for the Sobel 3x3 convolution along the X axis. Data types supported: S16.
-     * @param[out]    output_y              (optional) Destination for the Sobel 3x3 convolution along the Y axis. Data types supported: S16.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLSOBEL3X3_H */

diff --git a/arm_compute/runtime/CL/functions/CLSobel5x5.h b/arm_compute/runtime/CL/functions/CLSobel5x5.h
deleted file mode 100644
index e791d8a..0000000
--- a/arm_compute/runtime/CL/functions/CLSobel5x5.h
+++ /dev/null

@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL5X5_H
-#define ARM_COMPUTE_CLSOBEL5X5_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLFillBorderKernel;
-class CLSobel5x5HorKernel;
-class CLSobel5x5VertKernel;
-class ICLTensor;
-
-/** Basic function to execute sobel 5x5 filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLSobel5x5HorKernel
- * -# @ref CLSobel5x5VertKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class CLSobel5x5 : public IFunction
-{
-public:
-    /** Default Constructor.
-     *
-     * @param[in] memory_manager (Optional) Memory manager.
-     */
-    CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied */
-    CLSobel5x5(const CLSobel5x5 &) = delete;
-    /** Prevent instances of this class from being copied */
-    CLSobel5x5 &operator=(const CLSobel5x5 &) = delete;
-    /** Default destructor */
-    ~CLSobel5x5();
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note At least one of output_x or output_y must be not NULL.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output_x              (optional) Destination for the Sobel 5x5 convolution along the X axis. Data types supported: S16.
-     * @param[out]    output_y              (optional) Destination for the Sobel 5x5 convolution along the Y axis. Data types supported: S16.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note At least one of output_x or output_y must be not NULL.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output_x              (optional) Destination for the Sobel 5x5 convolution along the X axis. Data types supported: S16.
-     * @param[out]    output_y              (optional) Destination for the Sobel 5x5 convolution along the Y axis. Data types supported: S16.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-
-    // Inherited methods overridden:
-    void run() override;
-
-protected:
-    MemoryGroup                           _memory_group;   /**< Function's memory group */
-    std::unique_ptr<CLSobel5x5HorKernel>  _sobel_hor;      /**< Sobel Horizontal 5x5 kernel */
-    std::unique_ptr<CLSobel5x5VertKernel> _sobel_vert;     /**< Sobel Vertical 5x5 kernel */
-    std::unique_ptr<CLFillBorderKernel>   _border_handler; /**< Kernel to handle image borders */
-    CLImage                               _tmp_x;          /**< Temporary buffer for Sobel X */
-    CLImage                               _tmp_y;          /**< Temporary buffer for Sobel Y */
-};
-}
-#endif /*ARM_COMPUTE_CLSOBEL5X5_H */

diff --git a/arm_compute/runtime/CL/functions/CLSobel7x7.h b/arm_compute/runtime/CL/functions/CLSobel7x7.h
deleted file mode 100644
index 65e8de5..0000000
--- a/arm_compute/runtime/CL/functions/CLSobel7x7.h
+++ /dev/null

@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL7X7_H
-#define ARM_COMPUTE_CLSOBEL7X7_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class CLFillBorderKernel;
-class CLSobel7x7HorKernel;
-class CLSobel7x7VertKernel;
-class ICLTensor;
-
-/** Basic function to execute sobel 7x7 filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLSobel7x7HorKernel
- * -# @ref CLSobel7x7VertKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- * 
- */
-class CLSobel7x7 : public IFunction
-{
-public:
-    /** Default Constructor.
-     *
-     * @param[in] memory_manager (Optional) Memory manager.
-     */
-    CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied */
-    CLSobel7x7(const CLSobel7x7 &) = delete;
-    /** Prevent instances of this class from being copied */
-    CLSobel7x7 &operator=(const CLSobel7x7 &) = delete;
-    /** Default destructor */
-    ~CLSobel7x7();
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note At least one of output_x or output_y must be not NULL.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output_x              (optional) Destination for the Sobel 7x7 convolution along the X axis. Data types supported: S32.
-     * @param[out]    output_y              (optional) Destination for the Sobel 7x7 convolution along the Y axis. Data types supported: S32.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note At least one of output_x or output_y must be not NULL.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output_x              (optional) Destination for the Sobel 7x7 convolution along the X axis. Data types supported: S32.
-     * @param[out]    output_y              (optional) Destination for the Sobel 7x7 convolution along the Y axis. Data types supported: S32.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-
-    // Inherited methods overridden:
-    void run() override;
-
-protected:
-    MemoryGroup                           _memory_group;   /**< Function's memory group */
-    std::unique_ptr<CLSobel7x7HorKernel>  _sobel_hor;      /**< Sobel Horizontal 7x7 kernel */
-    std::unique_ptr<CLSobel7x7VertKernel> _sobel_vert;     /**< Sobel Vertical 7x7 kernel */
-    std::unique_ptr<CLFillBorderKernel>   _border_handler; /**< Kernel to handle image borders */
-    CLImage                               _tmp_x;          /**< Temporary buffer for Sobel X */
-    CLImage                               _tmp_y;          /**< Temporary buffer for Sobel Y */
-};
-}
-#endif /*ARM_COMPUTE_CLSOBEL7X7_H */

diff --git a/arm_compute/runtime/CL/functions/CLTableLookup.h b/arm_compute/runtime/CL/functions/CLTableLookup.h
deleted file mode 100644
index ca59309..0000000
--- a/arm_compute/runtime/CL/functions/CLTableLookup.h
+++ /dev/null

@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTABLELOOKUP_H
-#define ARM_COMPUTE_CLTABLELOOKUP_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-class ICLLut;
-
-/** Basic function to run @ref CLTableLookupKernel */
-class CLTableLookup : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]  input  First tensor input. Data types supported: U8 and S16
-     * @param[in]  lut    Input lookup table. Data types supported: U8 and S16
-     * @param[out] output Output tensor. Data types supported: U8 and S16
-     */
-    void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           First tensor input. Data types supported: U8 and S16
-     * @param[in]  lut             Input lookup table. Data types supported: U8 and S16
-     * @param[out] output          Output tensor. Data types supported: U8 and S16
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
-};
-}
-#endif /*ARM_COMPUTE_CLTABLELOOKUP_H */

diff --git a/arm_compute/runtime/CL/functions/CLThreshold.h b/arm_compute/runtime/CL/functions/CLThreshold.h
deleted file mode 100644
index a681748..0000000
--- a/arm_compute/runtime/CL/functions/CLThreshold.h
+++ /dev/null

@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTHRESHOLD_H
-#define ARM_COMPUTE_CLTHRESHOLD_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-// Forward declarations
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to run @ref CLThresholdKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLThreshold : public ICLSimpleFunction
-{
-public:
-    /** Initialise the function's source, destination, thresholds and threshold type
-     *
-     * @param[in]  input  First tensor input. Data types supported: U8.
-     * @param[out] output Output tensor. Data types supported: U8.
-     * @param[in]  info   Threshold  descriptor
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
-    /** Initialise the function's source, destination, thresholds and threshold type
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           First tensor input. Data types supported: U8.
-     * @param[out] output          Output tensor. Data types supported: U8.
-     * @param[in]  info            Threshold descriptor
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLTHRESHOLD_H */

diff --git a/arm_compute/runtime/CL/functions/CLWarpAffine.h b/arm_compute/runtime/CL/functions/CLWarpAffine.h
deleted file mode 100644
index 2f73097..0000000
--- a/arm_compute/runtime/CL/functions/CLWarpAffine.h
+++ /dev/null

@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPAFFINE_H
-#define ARM_COMPUTE_CLWARPAFFINE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to run @ref CLWarpAffineKernel for AFFINE transformation
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLWarpAffine : public ICLSimpleFunction
-{
-public:
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in,out] input                 Source temspr. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8.
-     * @param[in]     matrix                The affine matrix. Must be 2x3 of type float.
-     *                                      The matrix argument requires 9 values, the last 3 values are ignored.
-     * @param[in]     policy                The interpolation type.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source temspr. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8.
-     * @param[in]     matrix                The affine matrix. Must be 2x3 of type float.
-     *                                      The matrix argument requires 9 values, the last 3 values are ignored.
-     * @param[in]     policy                The interpolation type.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
-                   uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLWARPAFFINE_H */

diff --git a/arm_compute/runtime/CL/functions/CLWarpPerspective.h b/arm_compute/runtime/CL/functions/CLWarpPerspective.h
deleted file mode 100644
index 4e2c81e..0000000
--- a/arm_compute/runtime/CL/functions/CLWarpPerspective.h
+++ /dev/null

@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPPERSPECTIVE_H
-#define ARM_COMPUTE_CLWARPPERSPECTIVE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to run @ref CLWarpPerspectiveKernel for PERSPECTIVE transformation
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
-*/
-class CLWarpPerspective : public ICLSimpleFunction
-{
-public:
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor. Data types supported: U8.
-     * @param[in]     matrix                The perspective matrix. Must be 3x3 of type float.
-     * @param[in]     policy                The interpolation type.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor. Data types supported: U8.
-     * @param[in]     matrix                The perspective matrix. Must be 3x3 of type float.
-     * @param[in]     policy                The interpolation type.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
-                   uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLWARPPERSPECTIVE_H */

diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index 8b6649c..863a8a6 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h

@@ -39,7 +39,6 @@
 #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
 #include "arm_compute/runtime/NEON/functions/NECopy.h"
 #include "arm_compute/runtime/NEON/functions/NECropResize.h"
@@ -75,7 +74,6 @@
 #include "arm_compute/runtime/NEON/functions/NELogical.h"
 #include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
 #include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
@@ -91,6 +89,7 @@
 #include "arm_compute/runtime/NEON/functions/NERange.h"
 #include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
 #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/NEON/functions/NERemap.h"
 #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEReverse.h"

diff --git a/arm_compute/runtime/NEON/functions/NEConvolution.h b/arm_compute/runtime/NEON/functions/NEConvolution.h
deleted file mode 100644
index f2d7ae8..0000000
--- a/arm_compute/runtime/NEON/functions/NEConvolution.h
+++ /dev/null

@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVOLUTION_H
-#define ARM_COMPUTE_NECONVOLUTION_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-class NEFillBorderKernel;
-template <unsigned int matrix_size>
-class NEConvolutionKernel;
-template <unsigned int matrix_size>
-class NESeparableConvolutionHorKernel;
-template <unsigned int matrix_size>
-class NESeparableConvolutionVertKernel;
-
-/** Basic function to execute convolution of size 3x3. This function calls the following Neon kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEConvolution3x3Kernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class NEConvolution3x3 : public INESimpleFunction
-{
-public:
-    /** Constructor */
-    NEConvolution3x3() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolution3x3(const NEConvolution3x3 &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolution3x3 &operator=(const NEConvolution3x3 &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolution3x3(NEConvolution3x3 &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolution3x3 &operator=(NEConvolution3x3 &&) = delete;
-    /** Default destructor */
-    ~NEConvolution3x3();
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in,out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8/S16.
-     * @param[in]     conv                  Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
-     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-
-/** Basic function to execute convolution of size 5x5, 7x7, 9x9. This function calls the following Neon kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEConvolutionKernel or<br/>
- *    @ref NESeparableConvolutionHorKernel and @ref NESeparableConvolutionVertKernel (if convolution matrix is separable)
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-template <unsigned int matrix_size>
-class NEConvolutionSquare : public IFunction
-{
-public:
-    /** Default constructor */
-    NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionSquare(const NEConvolutionSquare &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionSquare &operator=(const NEConvolutionSquare &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolutionSquare(NEConvolutionSquare &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolutionSquare &operator=(NEConvolutionSquare &&) = delete;
-    /** Default destructor */
-    ~NEConvolutionSquare();
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in,out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8 or S16.
-     * @param[in]     conv                  matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
-     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    MemoryGroup                                                    _memory_group;   /**< Function memory group */
-    Tensor                                                         _tmp;            /**< temporary buffer for output of horizontal pass */
-    bool                                                           _is_separable;   /**< true if the convolution can be separated */
-    std::unique_ptr<NESeparableConvolutionHorKernel<matrix_size>>  _kernel_hor;     /**< kernel for horizontal pass of separated convolution */
-    std::unique_ptr<NESeparableConvolutionVertKernel<matrix_size>> _kernel_vert;    /**< kernel for vertical pass of separated convolution */
-    std::unique_ptr<NEConvolutionKernel<matrix_size>>              _kernel;         /**< kernel for non-separated convolution **/
-    std::unique_ptr<NEFillBorderKernel>                            _border_handler; /**< kernel for border handling */
-};
-
-/** Basic function to run 5x5 convolution. */
-using NEConvolution5x5 = NEConvolutionSquare<5>;
-/** Basic function to run 7x7 convolution. */
-using NEConvolution7x7 = NEConvolutionSquare<7>;
-/** Basic function to run 9x9 convolution. */
-using NEConvolution9x9 = NEConvolutionSquare<9>;
-
-/** Basic function to execute non-square convolution. This function calls the following Neon kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEConvolutionRectangleKernel or<br/>
- *
- * @note Convolution rectangle should have dimensions of 3, 5, 7, 9
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class NEConvolutionRectangle : public INESimpleFunction
-{
-public:
-    /** Constructor */
-    NEConvolutionRectangle() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionRectangle(const NEConvolutionRectangle &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionRectangle &operator=(const NEConvolutionRectangle &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolutionRectangle(NEConvolutionRectangle &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolutionRectangle &operator=(NEConvolutionRectangle &&) = delete;
-    /** Default destructor */
-    ~NEConvolutionRectangle();
-    /** Initialize the function's source, destination, conv and border_mode.
-     *
-     * @param[in,out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8 or S16.
-     * @param[in]     conv                  Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
-     * @param[in]     rows                  Rows of convolution kernel.
-     * @param[in]     cols                  Columns of convolution kernel.
-     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NECONVOLUTION_H */

diff --git a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
deleted file mode 100644
index 2fff72d..0000000
--- a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
+++ /dev/null

@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H
-#define ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute non-maxima suppression over a 3x3 window. This function calls the following Neon kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NENonMaximaSuppression3x3Kernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
- */
-class NENonMaximaSuppression3x3 : public INESimpleFunction
-{
-public:
-    /** Initialise the function's source, destinations and border mode.
-     *
-     * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT
-     *       The constant values used with CONSTANT border mode is 0
-     *
-     * @param[in, out] input       Source tensor. Data type supported: U8/F32. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     output      Destination for the Non-Maxima suppressions 3x3. Data type supported: same as @p input
-     * @param[in]      border_mode Border mode to use for non-maxima suppression. The implementation supports just 2 border modes: UNDEFINED and CONSTANT
-     *
-     */
-    void configure(ITensor *input, ITensor *output, BorderMode border_mode);
-};
-}
-#endif /* ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H */

diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h
new file mode 100644
index 0000000..84d0f2e
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NERemap.h

@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEREMAP_H
+#define ARM_COMPUTE_NEREMAP_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute remap. This function calls the following Neon kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NERemapKernel
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
+ */
+class NERemap : public INESimpleFunction
+{
+public:
+    /** Initialise the function's sources, destination, interpolation policy and border mode.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in]      map_x                 Map for X coordinates. Data type supported: F32.
+     * @param[in]      map_y                 Map for Y coordinates. Data type supported: F32.
+     * @param[out]     output                Output tensor. Data type supported: U8.
+     * @param[in]      policy                Interpolation policy to use. Only NEAREST and BILINEAR are supported.
+     * @param[in]      border_mode           Border mode to use on the input tensor.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output,
+                   InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*ARM_COMPUTE_NEREMAP_H */

diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 7659d56..8616cb6 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox

@@ -27,7 +27,7 @@
 
 @tableofcontents
 
-The Computer Vision and Machine Learning library is a set of functions optimised for both Arm CPUs and GPUs using SIMD technologies.
+The Compute Library is a collection of low-level machine learning functions optimized for both Arm CPUs and GPUs using SIMD technologies.
 
 Several builds of the library are available using various configurations:
  - OS: Linux, Android, macOS or bare metal.
@@ -95,6 +95,7 @@
    - NEChannelCombine
    - NEChannelExtract
    - NEColorConvert
+   - NEConvolution
    - NEDerivative
    - NEDilate
    - NEEqualizeHistogram
@@ -119,7 +120,6 @@
    - NENonLinearFilter
    - NEOpticalFlow
    - NEPhase
-   - NERemap
    - NEScharr3x3
    - NESobel3x3
    - NESobel5x5
@@ -127,9 +127,52 @@
    - NETableLookup
    - NEThreshold
    - NEWarpAffine
-   - NEWarpPerspective
- - Remove all GLES kernels / functions / tests / examples
+   - NEWarpPerspective
 
+ - Removed all GLES kernels / functions / tests / examples
+ - Removed computer vision support from CL backend
+ - Removed the following functions:
+   - CLAbsoluteDifference
+   - CLAccumulate
+   - CLBox3x3
+   - CLCannyEdge
+   - CLChannelCombine
+   - CLChannelExtract
+   - CLColorConvert
+   - CLConvolution
+   - CLDerivative
+   - CLDilate
+   - CLEqualizeHistogram
+   - CLErode
+   - CLFastCorners
+   - CLGaussian3x3
+   - CLGaussian5x5
+   - CLGaussianPyramid
+   - CLHOGDescriptor
+   - CLHOGDetector
+   - CLHOGGradient
+   - CLHOGMultiDetection
+   - CLHarrisCorners
+   - CLHistogram
+   - CLIntegralImage
+   - CLLaplacianPyramid
+   - CLLaplacianReconstruct
+   - CLMagnitude
+   - CLMeanStdDev
+   - CLMedian3x3
+   - CLMinMaxLocation
+   - CLNonLinearFilter
+   - CLOpticalFlow
+   - CLPhase
+   - CLScharr3x3
+   - CLSobel3x3
+   - CLSobel5x5
+   - CLSobel7x7
+   - CLTableLookup
+   - CLThreshold
+   - CLWarpAffine
+   - CLWarpPerspective
+ 
 v21.02 Public major release
  - Various bug fixes.
  - Various optimisations.
@@ -212,8 +255,8 @@
    - @ref NELogicalOr
  - Removed padding from Neon kernels:
    - @ref NEComplexPixelWiseMultiplicationKernel
-   - @ref NENonMaximaSuppression3x3Kernel
-   - NERemapKernel
+   - NENonMaximaSuppression3x3Kernel
+   - @ref NERemapKernel
    - @ref NEGEMMInterleave4x4Kernel
    - @ref NEDirectConvolutionLayerKernel
    - @ref NEScaleKernel
@@ -221,7 +264,7 @@
    - @ref NEGEMMLowpOffsetContributionKernel
    - @ref NEGEMMTranspose1xWKernel
    - NEPoolingLayerKernel
-   - @ref NEConvolutionKernel
+   - NEConvolutionKernel
    - @ref NEDepthwiseConvolutionLayerNativeKernel
    - @ref NEGEMMLowpMatrixMultiplyKernel
    - @ref NEGEMMMatrixMultiplyKernel
@@ -534,7 +577,7 @@
    - NEGEMMLowpQuantizeDownInt32ToUint8Scale
    - NEGEMMMatrixAccumulateBiasesKernel
  - Deprecated functions / interfaces:
-   - Non-descriptor based interfaces for NEThreshold, @ref CLThreshold
+   - Non-descriptor based interfaces for NEThreshold, CLThreshold
    - Non-descriptor based interfaces for @ref NEScale, @ref CLScale and GCScale
    - In @ref NESoftmaxLayer, @ref NELogSoftmaxLayer, @ref CLSoftmaxLayer, @ref CLLogSoftmaxLayer and GCSoftmaxLayer :
       The default "axis" value for @ref CLSoftmaxLayer, @ref CLLogSoftmaxLayer and GCSoftmaxLayer is changed from 1 to 0.
@@ -791,7 +834,7 @@
     - @ref CLGEMMLowpMatrixMultiplyNativeKernel
     - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
     - @ref CLGEMMMatrixMultiplyNativeKernel
-    - @ref CLMeanStdDevNormalizationKernel / @ref CLMeanStdDevNormalizationLayer
+    - CLMeanStdDevNormalizationKernel / CLMeanStdDevNormalizationLayer
     - @ref CLSpaceToDepthLayerKernel / @ref CLSpaceToDepthLayer
  - New examples:
     - neon_opticalflow
@@ -1286,7 +1329,7 @@
  - New OpenCL kernels / functions:
     - @ref CLBatchNormalizationLayerKernel / @ref CLBatchNormalizationLayer
     - CLDepthConcatenateLayerKernel / CLDepthConcatenateLayer
-    - @ref CLHOGOrientationBinningKernel @ref CLHOGBlockNormalizationKernel, @ref CLHOGDetectorKernel / @ref CLHOGDescriptor @ref CLHOGDetector @ref CLHOGGradient @ref CLHOGMultiDetection
+    - CLHOGOrientationBinningKernel, CLHOGBlockNormalizationKernel, CLHOGDetectorKernel / CLHOGDescriptor, CLHOGDetector, CLHOGGradient, CLHOGMultiDetection
     - CLLocallyConnectedMatrixMultiplyKernel / CLLocallyConnectedLayer
     - @ref CLWeightsReshapeKernel / @ref CLConvolutionLayerReshapeWeights
  - New C++ kernels:
@@ -1308,12 +1351,12 @@
 v17.04 Public bug fixes release
 
  The following functions have been ported to use the new accurate padding:
- -  @ref CLColorConvertKernel
- -  @ref CLEdgeNonMaxSuppressionKernel
- -  @ref CLEdgeTraceKernel
- -  @ref CLGaussianPyramidHorKernel
- -  @ref CLGaussianPyramidVertKernel
- -  @ref CLGradientKernel
+ -  CLColorConvertKernel
+ -  CLEdgeNonMaxSuppressionKernel
+ -  CLEdgeTraceKernel
+ -  CLGaussianPyramidHorKernel
+ -  CLGaussianPyramidVertKernel
+ -  CLGradientKernel
  -  NEChannelCombineKernel
  -  @ref NEFillArrayKernel
  -  NEGaussianPyramidHorKernel
@@ -1324,8 +1367,8 @@
  -  NELogits1DMaxKernel
  -  NELogits1DShiftExpSumKernel
  -  NELogits1DNormKernel
- -  @ref NENonMaximaSuppression3x3FP16Kernel
- -  @ref NENonMaximaSuppression3x3Kernel
+ -  NENonMaximaSuppression3x3FP16Kernel
+ -  NENonMaximaSuppression3x3Kernel
 
 v17.03.1 First Major public release of the sources
  - Renamed the library to arm_compute
@@ -1343,13 +1386,13 @@
 
 v17.03 Sources preview
  - New OpenCL kernels / functions:
-   - @ref CLGradientKernel, @ref CLEdgeNonMaxSuppressionKernel, @ref CLEdgeTraceKernel / @ref CLCannyEdge
+   - CLGradientKernel, CLEdgeNonMaxSuppressionKernel, CLEdgeTraceKernel / CLCannyEdge
    - GEMM refactoring + FP16 support: CLGEMMInterleave4x4Kernel, CLGEMMTranspose1xWKernel, @ref CLGEMMMatrixMultiplyKernel, CLGEMMMatrixAdditionKernel / @ref CLGEMM
    - CLGEMMMatrixAccumulateBiasesKernel / @ref CLFullyConnectedLayer
    - @ref CLTransposeKernel / @ref CLTranspose
-   - @ref CLLKTrackerInitKernel, @ref CLLKTrackerStage0Kernel, @ref CLLKTrackerStage1Kernel, @ref CLLKTrackerFinalizeKernel / @ref CLOpticalFlow
+   - @ref CLLKTrackerInitKernel, @ref CLLKTrackerStage0Kernel, @ref CLLKTrackerStage1Kernel, @ref CLLKTrackerFinalizeKernel / CLOpticalFlow
    - @ref CLNormalizationLayerKernel / @ref CLNormalizationLayer
-   - @ref CLLaplacianPyramid, @ref CLLaplacianReconstruct
+   - CLLaplacianPyramid, CLLaplacianReconstruct
  - New Neon kernels / functions:
    - NEActivationLayerKernel / @ref NEActivationLayer
    - GEMM refactoring + FP16 support (Requires armv8.2 CPU): @ref NEGEMMInterleave4x4Kernel, @ref NEGEMMTranspose1xWKernel, @ref NEGEMMMatrixMultiplyKernel, @ref NEGEMMMatrixAdditionKernel / @ref NEGEMM
@@ -1359,23 +1402,23 @@
  - New OpenCL kernels / functions:
    - CLLogits1DMaxKernel, CLLogits1DShiftExpSumKernel, @ref CLLogits1DNormKernel / @ref CLSoftmaxLayer
    - CLPoolingLayerKernel / @ref CLPoolingLayer
-   - @ref CLIm2ColKernel, @ref CLCol2ImKernel, CLConvolutionLayerWeightsReshapeKernel / @ref CLConvolutionLayer
+   - @ref CLIm2ColKernel, @ref CLCol2ImKernel, CLConvolutionLayerWeightsReshapeKernel / CLConvolutionLayer
    - @ref CLRemapKernel / @ref CLRemap
-   - @ref CLGaussianPyramidHorKernel, @ref CLGaussianPyramidVertKernel / @ref CLGaussianPyramid, @ref CLGaussianPyramidHalf, @ref CLGaussianPyramidOrb
-   - @ref CLMinMaxKernel, @ref CLMinMaxLocationKernel / @ref CLMinMaxLocation
-   - @ref CLNonLinearFilterKernel / @ref CLNonLinearFilter
+   - CLGaussianPyramidHorKernel, CLGaussianPyramidVertKernel / CLGaussianPyramid, CLGaussianPyramidHalf, CLGaussianPyramidOrb
+   - CLMinMaxKernel, CLMinMaxLocationKernel / CLMinMaxLocation
+   - CLNonLinearFilterKernel / CLNonLinearFilter
  - New Neon FP16 kernels (Requires armv8.2 CPU)
    - NEAccumulateWeightedFP16Kernel
    - NEBox3x3FP16Kernel
-   - @ref NENonMaximaSuppression3x3FP16Kernel
+   - NENonMaximaSuppression3x3FP16Kernel
 
 v17.02 Sources preview
  - New OpenCL kernels / functions:
    - CLActivationLayerKernel / @ref CLActivationLayer
-   - @ref CLChannelCombineKernel / @ref CLChannelCombine
-   - @ref CLDerivativeKernel / @ref CLChannelExtract
-   - @ref CLFastCornersKernel / @ref CLFastCorners
-   - @ref CLMeanStdDevKernel / @ref CLMeanStdDev
+   - CLChannelCombineKernel / CLChannelCombine
+   - CLDerivativeKernel / CLChannelExtract
+   - CLFastCornersKernel / CLFastCorners
+   - CLMeanStdDevKernel / CLMeanStdDev
  - New Neon kernels / functions:
    - HOG / SVM: NEHOGOrientationBinningKernel, NEHOGBlockNormalizationKernel, NEHOGDetectorKernel, NEHOGNonMaximaSuppressionKernel / NEHOGDescriptor, NEHOGDetector, NEHOGGradient, NEHOGMultiDetection
    - NENonLinearFilterKernel / NENonLinearFilter
@@ -1893,8 +1936,6 @@
 
 Enabling 16-bit floating point calculations require \a cl_khr_fp16 extension to be supported. All Mali GPUs with compute capabilities have native support for half precision floating points.
 
-Use of @ref CLMeanStdDev function requires 64-bit atomics support, thus \a cl_khr_int64_base_atomics should be supported in order to use.
-
 @subsubsection S3_7_2_cl_performance_requirements Performance improvements
 
 Integer dot product built-in function extensions (and therefore optimized kernels) are available with Mali OpenCL DDK r22p0 and above for the following GPUs : G71, G76. The relevant extensions are \a cl_arm_integer_dot_product_int8, \a cl_arm_integer_dot_product_accumulate_int8 and \a cl_arm_integer_dot_product_accumulate_int16.

diff --git a/docs/01_library.dox b/docs/01_library.dox
index 641fc3e..5cd33b6 100644
--- a/docs/01_library.dox
+++ b/docs/01_library.dox

@@ -191,7 +191,7 @@
 
 Functions will automatically allocate the temporary buffers mentioned above, and will automatically multi-thread kernels' executions using the very basic scheduler described in the previous section.
 
-Simple functions only call a single kernel (e.g @ref NEConvolution3x3), while more complex ones consist of several kernels pipelined together (e.g @ref NEFullyConnectedLayer ). Check their documentation to find out which kernels are used by each function.
+Simple functions only call a single kernel (e.g. NEConvolution3x3), while more complex ones consist of several kernels pipelined together (e.g. @ref NEFullyConnectedLayer ). Check their documentation to find out which kernels are used by each function.
 
 @code{.cpp}
 //Create a function object:
@@ -225,9 +225,6 @@
 
 In order to block until all the jobs in the CLScheduler's command queue are done executing the user can call @ref CLScheduler::sync() or create a sync event using @ref CLScheduler::enqueue_sync_event()
 
-For example:
-@snippet cl_events.cpp OpenCL events
-
 @subsection S4_4_2_cl_neon OpenCL / Neon interoperability
 
 You can mix OpenCL and Neon kernels and functions. However it is the user's responsibility to handle the mapping/unmapping of OpenCL objects.
@@ -260,8 +257,6 @@
 
 - Accurate padding:
 
-@snippet neon_convolution.cpp Accurate padding
-
 @note It's important to call allocate @b after the function is configured: if the image / tensor is already allocated then the function will shrink its execution window instead of increasing the padding. (See below for more details).
 
 - Manual padding / no padding / auto padding: You can allocate your images / tensors up front (before configuring your functions). In that case the function will use whatever padding is available and will shrink its execution window if there isn't enough padding available (which translates into a smaller valid region for the output). See also @ref valid_region).

diff --git a/docs/06_functions_list.dox b/docs/06_functions_list.dox
index 1f4794f..0c5145c 100644
--- a/docs/06_functions_list.dox
+++ b/docs/06_functions_list.dox

@@ -38,8 +38,6 @@
         - @ref NEBoundingBoxTransform
         - @ref NECast
         - @ref NEComplexPixelWiseMultiplication
-        - @ref NEConvolution3x3
-        - @ref NEConvolutionRectangle
         - @ref NEElementwiseComparison
         - @ref NEElementwiseComparisonStatic
         - @ref NEElementwiseDivision
@@ -50,7 +48,6 @@
         - @ref NELogicalAnd
         - @ref NELogicalNot
         - @ref NELogicalOr
-        - @ref NENonMaximaSuppression3x3
         - @ref NEPixelWiseMultiplication
         - @ref NEPReluLayer
         - @ref NEROIAlignLayer
@@ -90,7 +87,6 @@
     - @ref NEConvertFullyConnectedWeights
     - @ref NEConvolutionLayer
     - @ref NEConvolutionLayerReshapeWeights
-    - @ref NEConvolutionSquare &lt;matrix_size&gt;
     - @ref NECropResize
     - @ref NEDeconvolutionLayer
     - @ref NEDepthwiseConvolutionAssemblyDispatch
@@ -140,12 +136,10 @@
 - @ref IFunction
     - @ref CLBatchNormalizationLayer
     - @ref CLBatchToSpaceLayer
-    - @ref CLCannyEdge
     - @ref CLComplexPixelWiseMultiplication
     - @ref CLConcatenateLayer
     - @ref CLConvolutionLayer
     - @ref CLConvolutionLayerReshapeWeights
-    - @ref CLConvolutionSquare &lt;matrix_size&gt;
     - @ref CLCropResize
     - @ref CLDeconvolutionLayer
     - @ref CLDeconvolutionLayerUpsample
@@ -154,32 +148,17 @@
     - @ref CLDequantizationLayer
     - @ref CLDirectConvolutionLayer
     - @ref CLDirectDeconvolutionLayer
-    - @ref CLEqualizeHistogram
-    - @ref CLFastCorners
     - @ref CLFFT1D
     - @ref CLFFT2D
     - @ref CLFFTConvolutionLayer
     - @ref CLFullyConnectedLayer
     - @ref CLFuseBatchNormalization
-    - @ref CLGaussian5x5
-    - @ref CLGaussianPyramid
-        - @ref CLGaussianPyramidHalf
-        - @ref CLGaussianPyramidOrb
     - @ref CLGEMM
     - @ref CLGEMMConvolutionLayer
     - @ref CLGEMMDeconvolutionLayer
     - @ref CLGEMMLowpMatrixMultiplyCore
     - @ref CLGenerateProposalsLayer
-    - @ref CLHarrisCorners
-    - @ref CLHistogram
-    - @ref CLHOGDescriptor
-    - @ref CLHOGDetector
-    - @ref CLHOGGradient
-    - @ref CLHOGMultiDetection
-    - @ref CLIntegralImage
     - @ref CLL2NormalizeLayer
-    - @ref CLLaplacianPyramid
-    - @ref CLLaplacianReconstruct
     - @ref CLLogicalAnd
     - @ref CLLogicalNot
     - @ref CLLogicalOr
@@ -187,18 +166,13 @@
     - @ref CLLSTMLayerQuantized
     - @ref CLQLSTMLayer
     - @ref CLMaxUnpoolingLayer
-    - @ref CLMeanStdDev
-    - @ref CLMinMaxLocation
     - @ref CLNormalizationLayer
     - @ref CLNormalizePlanarYUVLayer
-    - @ref CLOpticalFlow
     - @ref CLPadLayer
     - @ref CLQuantizationLayer
     - @ref CLReduceMean
     - @ref CLReductionOperation
     - @ref CLRNNLayer
-    - @ref CLSobel5x5
-    - @ref CLSobel7x7
     - @ref CLSoftmaxLayerGeneric &lt;IS_LOG&gt;
     - @ref CLSpaceToBatchLayer
     - @ref CLSpaceToDepthLayer
@@ -207,10 +181,6 @@
     - @ref CLUnstack
     - @ref CLWinogradConvolutionLayer
     - @ref ICLSimpleFunction
-        - @ref CLAbsoluteDifference
-        - @ref CLAccumulate
-        - @ref CLAccumulateSquared
-        - @ref CLAccumulateWeighted
         - @ref CLActivationLayer
         - @ref CLArgMinMaxLayer
         - @ref CLArithmeticAddition
@@ -221,25 +191,16 @@
         - @ref CLBitwiseOr
         - @ref CLBitwiseXor
         - @ref CLBoundingBoxTransform
-        - @ref CLBox3x3
         - @ref CLCast
-        - @ref CLChannelCombine
-        - @ref CLChannelExtract
         - @ref CLChannelShuffleLayer
-        - @ref CLColorConvert
         - @ref CLComparison
         - @ref CLComparisonStatic
         - @ref CLConvertFullyConnectedWeights
-        - @ref CLConvolution3x3
-        - @ref CLConvolutionRectangle
         - @ref CLCopy
         - @ref CLDepthConvertLayer
-        - @ref CLDerivative
-        - @ref CLDilate
         - @ref CLElementwiseMax
         - @ref CLElementwiseMin
         - @ref CLElementwiseSquaredDiff
-        - @ref CLErode
         - @ref CLExpLayer
         - @ref CLFill
         - @ref CLFillBorder
@@ -247,23 +208,16 @@
         - @ref CLFloor
         - @ref CLFullyConnectedLayerReshapeWeights
         - @ref CLGather
-        - @ref CLGaussian3x3
         - @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
         - @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
         - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
-        - @ref CLMagnitude
         - @ref CLMeanStdDevNormalizationLayer
-        - @ref CLMedian3x3
-        - @ref CLNonLinearFilter
-        - @ref CLNonMaximaSuppression3x3
         - @ref CLPermute
-        - @ref CLPhase
         - @ref CLPixelWiseMultiplication
         - @ref CLPoolingLayer
         - @ref CLPReluLayer
         - @ref CLPriorBoxLayer
         - @ref CLRange
-        - @ref CLRemap
         - @ref CLReorgLayer
         - @ref CLReshapeLayer
         - @ref CLReverse
@@ -271,17 +225,11 @@
         - @ref CLROIPoolingLayer
         - @ref CLRsqrtLayer
         - @ref CLScale
-        - @ref CLScharr3x3
         - @ref CLSelect
         - @ref CLSlice
-        - @ref CLSobel3x3
         - @ref CLStridedSlice
-        - @ref CLTableLookup
-        - @ref CLThreshold
         - @ref CLTile
         - @ref CLTranspose
-        - @ref CLWarpAffine
-        - @ref CLWarpPerspective
         - @ref CLWinogradInputTransform
 
 @section S6_3 CPP functions

diff --git a/examples/cl_convolution.cpp b/examples/cl_convolution.cpp
deleted file mode 100644
index bfa53f3..0000000
--- a/examples/cl_convolution.cpp
+++ /dev/null

@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */
-#error "This example needs to be built with -DARM_COMPUTE_CL"
-#endif /* ARM_COMPUTE_CL */
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLConvolution.h"
-#include "utils/ImageLoader.h"
-#include "utils/Utils.h"
-
-using namespace arm_compute;
-using namespace utils;
-
-/** Gaussian 3x3 matrix
- */
-const std::array<int16_t, 9> gaussian3x3 =
-{
-    1, 2, 1,
-    2, 4, 2,
-    1, 2, 1
-};
-
-/** Gaussian 5x5 matrix
- */
-const std::array<int16_t, 25> gaussian5x5 =
-{
-    1, 4, 6, 4, 1,
-    4, 16, 24, 16, 4,
-    6, 24, 36, 24, 6,
-    4, 16, 24, 16, 4,
-    1, 4, 6, 4, 1
-};
-
-class CLConvolutionExample : public Example
-{
-public:
-    bool do_setup(int argc, char **argv) override
-    {
-        PPMLoader ppm;
-
-        CLScheduler::get().default_init();
-
-        if(argc < 2)
-        {
-            // Print help
-            std::cout << "Usage: ./build/cl_convolution [input_image.ppm]\n\n";
-            std::cout << "No input_image provided, creating a dummy 640x480 image\n";
-            // Create an empty grayscale 640x480 image
-            src.allocator()->init(TensorInfo(640, 480, Format::U8));
-        }
-        else
-        {
-            ppm.open(argv[1]);
-            ppm.init_image(src, Format::U8);
-        }
-
-        // Configure the temporary and destination images
-        tmp.allocator()->init(*src.info());
-        dst.allocator()->init(*src.info());
-
-        // Apply a Gaussian 3x3 filter to the source image followed by a Gaussian 5x5:
-        conv3x3.configure(&src, &tmp, gaussian3x3.data(), 0 /* Let arm_compute calculate the scale */, BorderMode::UNDEFINED);
-        conv5x5.configure(&tmp, &dst, gaussian5x5.data(), 0 /* Let arm_compute calculate the scale */, BorderMode::UNDEFINED);
-
-        // Allocate all the images
-        src.allocator()->allocate();
-        tmp.allocator()->allocate();
-        dst.allocator()->allocate();
-        // Fill the input image with the content of the PPM image if a filename was provided:
-        if(ppm.is_open())
-        {
-            ppm.fill_image(src);
-            output_filename = std::string(argv[1]) + "_out.ppm";
-        }
-
-        return true;
-    }
-    void do_run() override
-    {
-        // Execute the functions:
-        conv3x3.run();
-        conv5x5.run();
-
-        // Make sure all the OpenCL jobs are done executing:
-        CLScheduler::get().sync();
-    }
-    void do_teardown() override
-    {
-        // Save the result to file:
-        if(!output_filename.empty())
-        {
-            save_to_ppm(dst, output_filename); // save_to_ppm maps and unmaps the image to store as PPM
-        }
-    }
-
-private:
-    CLImage          src{};
-    CLImage          tmp{};
-    CLImage          dst{};
-    CLConvolution3x3 conv3x3{};
-    CLConvolution5x5 conv5x5{};
-    std::string      output_filename{};
-};
-
-/** Main program for convolution test
- *
- * @param[in] argc Number of arguments
- * @param[in] argv Arguments ( [optional] Path to PPM image to process )
- */
-int main(int argc, char **argv)
-{
-    return utils::run_example<CLConvolutionExample>(argc, argv);
-}

diff --git a/examples/cl_events.cpp b/examples/cl_events.cpp
deleted file mode 100644
index 27c063c..0000000
--- a/examples/cl_events.cpp
+++ /dev/null

@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */
-#error "This example needs to be built with -DARM_COMPUTE_CL"
-#endif /* ARM_COMPUTE_CL */
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
-#include "arm_compute/runtime/CL/functions/CLScale.h"
-#include "utils/ImageLoader.h"
-#include "utils/Utils.h"
-
-using namespace arm_compute;
-using namespace utils;
-
-class CLEventsExample : public Example
-{
-public:
-    bool do_setup(int argc, char **argv) override
-    {
-        /** [OpenCL events] **/
-        PPMLoader     ppm;
-        constexpr int scale_factor = 2;
-
-        CLScheduler::get().default_init();
-
-        if(argc < 2)
-        {
-            // Print help
-            std::cout << "Usage: ./build/cl_events [input_image.ppm]\n\n";
-            std::cout << "No input_image provided, creating a dummy 640x480 image\n";
-            // Create an empty grayscale 640x480 image
-            src.allocator()->init(TensorInfo(640, 480, Format::U8));
-        }
-        else
-        {
-            ppm.open(argv[1]);
-            ppm.init_image(src, Format::U8);
-        }
-
-        TensorInfo dst_info(src.info()->dimension(0) / scale_factor, src.info()->dimension(1) / scale_factor, Format::U8);
-
-        // Configure the temporary and destination images
-        dst.allocator()->init(dst_info);
-        tmp_scale_median.allocator()->init(dst_info);
-        tmp_median_gauss.allocator()->init(dst_info);
-
-        //Configure the functions:
-        scale.configure(&src, &tmp_scale_median, ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::REPLICATE });
-        median.configure(&tmp_scale_median, &tmp_median_gauss, BorderMode::REPLICATE);
-        gauss.configure(&tmp_median_gauss, &dst, BorderMode::REPLICATE);
-
-        // Allocate all the images
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-        tmp_scale_median.allocator()->allocate();
-        tmp_median_gauss.allocator()->allocate();
-
-        // Fill the input image with the content of the PPM image if a filename was provided:
-        if(ppm.is_open())
-        {
-            ppm.fill_image(src);
-            output_filename = std::string(argv[1]) + "_out.ppm";
-        }
-        /** [OpenCL events] **/
-
-        return true;
-    }
-    void do_run() override
-    {
-        // Enqueue and flush the scale OpenCL kernel:
-        scale.run();
-        // Create a synchronisation event between scale and median:
-        cl::Event scale_event = CLScheduler::get().enqueue_sync_event();
-        // Enqueue and flush the median OpenCL kernel:
-        median.run();
-        // Enqueue and flush the Gaussian OpenCL kernel:
-        gauss.run();
-
-        //Make sure all the OpenCL jobs are done executing:
-        scale_event.wait();        // Block until Scale is done executing (Median3x3 and Gaussian5x5 might still be running)
-        CLScheduler::get().sync(); // Block until Gaussian5x5 is done executing
-    }
-    void do_teardown() override
-    {
-        // Save the result to file:
-        if(!output_filename.empty())
-        {
-            save_to_ppm(dst, output_filename); // save_to_ppm maps and unmaps the image to store as PPM
-        }
-    }
-
-private:
-    CLImage       src{}, tmp_scale_median{}, tmp_median_gauss{}, dst{};
-    CLScale       scale{};
-    CLMedian3x3   median{};
-    CLGaussian5x5 gauss{};
-    std::string   output_filename{};
-};
-
-/** Main program for convolution test
- *
- * @param[in] argc Number of arguments
- * @param[in] argv Arguments ( [optional] Path to PPM image to process )
- */
-int main(int argc, char **argv)
-{
-    return utils::run_example<CLEventsExample>(argc, argv);
-}

diff --git a/examples/neon_convolution.cpp b/examples/neon_convolution.cpp
deleted file mode 100644
index 0b33c76..0000000
--- a/examples/neon_convolution.cpp
+++ /dev/null

@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/NEFunctions.h"
-
-#include "arm_compute/core/Types.h"
-#include "utils/ImageLoader.h"
-#include "utils/Utils.h"
-
-using namespace arm_compute;
-using namespace utils;
-
-/** Gaussian 3x3 matrix
- */
-const std::array<int16_t, 9> gaussian3x3 =
-{
-    1, 2, 1,
-    2, 4, 2,
-    1, 2, 1
-};
-
-/** Gaussian 5x5 matrix
- */
-const std::array<int16_t, 25> gaussian5x5 =
-{
-    1, 4, 6, 4, 1,
-    4, 16, 24, 16, 4,
-    6, 24, 36, 24, 6,
-    4, 16, 24, 16, 4,
-    1, 4, 6, 4, 1
-};
-
-class NEONConvolutionExample : public Example
-{
-public:
-    bool do_setup(int argc, char **argv) override
-    {
-        /** [Accurate padding] **/
-        PPMLoader ppm;
-
-        if(argc < 2)
-        {
-            // Print help
-            std::cout << "Usage: ./build/neon_convolution [input_image.ppm]\n\n";
-            std::cout << "No input_image provided, creating a dummy 640x480 image\n";
-            // Initialize just the dimensions and format of your buffers:
-            src.allocator()->init(TensorInfo(640, 480, Format::U8));
-        }
-        else
-        {
-            ppm.open(argv[1]);
-            // Initialize just the dimensions and format of your buffers:
-            ppm.init_image(src, Format::U8);
-        }
-
-        // Initialize just the dimensions and format of the temporary and destination images:
-        tmp.allocator()->init(*src.info());
-        dst.allocator()->init(*src.info());
-
-        // Apply a Gaussian 3x3 filter to the source image followed by a Gaussian 5x5:
-        // The function will automatically update the padding information inside input and output to match its requirements
-        conv3x3.configure(&src, &tmp, gaussian3x3.data(), 0 /* Let arm_compute calculate the scale */, BorderMode::UNDEFINED);
-        conv5x5.configure(&tmp, &dst, gaussian5x5.data(), 0 /* Let arm_compute calculate the scale */, BorderMode::UNDEFINED);
-
-        // Now that the padding requirements are known we can allocate the images:
-        src.allocator()->allocate();
-        tmp.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        // Fill the input image with the content of the PPM image if a filename was provided:
-        if(ppm.is_open())
-        {
-            ppm.fill_image(src);
-            output_filename = std::string(argv[1]) + "_out.ppm";
-        }
-        /** [Accurate padding] **/
-
-        return true;
-    }
-    void do_run() override
-    {
-        //Execute the functions:
-        conv3x3.run();
-        conv5x5.run();
-    }
-    void do_teardown() override
-    {
-        // Save the result to file:
-        if(!output_filename.empty())
-        {
-            save_to_ppm(dst, output_filename); // save_to_ppm maps and unmaps the image to store as PPM
-        }
-    }
-
-private:
-    Image            src{}, tmp{}, dst{};
-    NEConvolution3x3 conv3x3{};
-    NEConvolution5x5 conv5x5{};
-    std::string      output_filename{};
-};
-
-/** Main program for convolution test
- *
- * @param[in] argc Number of arguments
- * @param[in] argv Arguments ( [optional] Path to PPM image to process )
- */
-int main(int argc, char **argv)
-{
-    return utils::run_example<NEONConvolutionExample>(argc, argv);
-}

diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 75f76ea..14d3a2c 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp

@@ -177,10 +177,6 @@
 using namespace arm_compute;
 const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
 {
-    { "absdiff", "absdiff.cl" },
-    { "accumulate", "accumulate.cl" },
-    { "accumulate_squared", "accumulate.cl" },
-    { "accumulate_weighted", "accumulate.cl" },
     { "activation_layer", "activation_layer.cl" },
     { "activation_layer_quant", "activation_layer_quant.cl" },
     { "activation_layer_quant_f32", "activation_layer_quant.cl" },
@@ -200,21 +196,8 @@
     { "bitwise_not", "bitwise_op.cl" },
     { "bounding_box_transform", "bounding_box_transform.cl" },
     { "bounding_box_transform_quantized", "bounding_box_transform_quantized.cl" },
-    { "channel_combine_NV", "channel_combine.cl" },
-    { "channel_combine_RGB888", "channel_combine.cl" },
-    { "channel_combine_RGBA8888", "channel_combine.cl" },
-    { "channel_combine_UYVY422", "channel_combine.cl" },
-    { "channel_combine_YUYV422", "channel_combine.cl" },
     { "channel_shuffle_nchw", "channel_shuffle.cl" },
     { "channel_shuffle_nhwc", "channel_shuffle.cl" },
-    { "channel_extract_NV12", "channel_extract.cl" },
-    { "channel_extract_NV21", "channel_extract.cl" },
-    { "channel_extract_RGB888", "channel_extract.cl" },
-    { "channel_extract_RGBA8888", "channel_extract.cl" },
-    { "channel_extract_UYVY422", "channel_extract.cl" },
-    { "channel_extract_YUYV422", "channel_extract.cl" },
-    { "combine_gradients_L1", "canny.cl" },
-    { "combine_gradients_L2", "canny.cl" },
     { "compare_equal", "comparisons.cl" },
     { "compare_equal_quantized", "comparisons.cl" },
     { "compare_notequal", "comparisons.cl" },
@@ -232,25 +215,11 @@
     { "concatenate_height", "concatenate.cl" },
     { "concatenate_width_x2", "concatenate.cl" },
     { "concatenate_width_x4", "concatenate.cl" },
-    { "convolution_rectangle", "convolution_rectangle.cl" },
     { "col2im", "col2im.cl" },
     { "convert_depth_down", "depth_convert.cl" },
     { "convert_depth_up", "depth_convert.cl" },
     { "convert_fc_weights", "convert_fc_weights.cl" },
-    { "convolution3x3_static", "convolution3x3.cl" },
-    { "convolution5x5_static", "convolution5x5.cl" },
-    { "convolution7x7_static", "convolution7x7.cl" },
-    { "convolution9x9_static", "convolution9x9.cl" },
-    { "convolution_separable1x5_static", "convolution5x5.cl" },
-    { "convolution_separable5x1_static", "convolution5x5.cl" },
-    { "convolution_separable1x7_static", "convolution7x7.cl" },
-    { "convolution_separable7x1_static", "convolution7x7.cl" },
-    { "convolution_separable1x9_static", "convolution9x9.cl" },
-    { "convolution_separable9x1_static", "convolution9x9.cl" },
     { "copy_tensor", "copy_tensor.cl" },
-    { "copy_plane", "channel_extract.cl" },
-    { "copy_planes_3p", "channel_combine.cl" },
-    { "copy_to_keypoint", "fast_corners.cl" },
     { "crop_tensor", "crop_tensor.cl" },
     { "deconvolution_reshape", "deconvolution_layer.cl" },
     { "deconvolution_upsample", "deconvolution_layer.cl" },
@@ -275,8 +244,6 @@
     { "dequantization_layer", "dequantization_layer.cl" },
     { "dequantization_layer_per_channel_nhwc", "dequantization_layer.cl" },
     { "dequantization_layer_per_channel_nchw", "dequantization_layer.cl" },
-    { "derivative", "derivative.cl" },
-    { "dilate", "dilate.cl" },
     { "direct_convolution_nhwc", "direct_convolution.cl" },
     { "direct_convolution1x1", "direct_convolution1x1.cl" },
     { "direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl" },
@@ -303,8 +270,6 @@
     { "elementwise_operation_SQUARED_DIFF_quantized", "elementwise_operation_quantized.cl" },
     { "elementwise_operation_PRELU_quantized", "elementwise_operation_quantized.cl" },
     { "elementwise_unary", "elementwise_unary.cl" },
-    { "erode", "erode.cl" },
-    { "fast_corners", "fast_corners.cl" },
     { "fft_digit_reverse_axis_0", "fft_digit_reverse.cl" },
     { "fft_digit_reverse_axis_1", "fft_digit_reverse.cl" },
     { "fft_radix_2_first_stage_axis_0", "fft.cl" },
@@ -334,12 +299,9 @@
     { "fft_scale_conj", "fft_scale.cl" },
     { "fill_image_borders_constant", "fill_border.cl" },
     { "fill_image_borders_replicate", "fill_border.cl" },
-    { "finalize", "optical_flow_pyramid_lk.cl" },
     { "floor_layer", "floor.cl" },
     { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
     { "gather", "gather.cl" },
-    { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
-    { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
     { "gemm_ma_f16", "gemm.cl" },
     { "gemm_ma_f32", "gemm.cl" },
     { "gemm_mv", "gemv.cl" },
@@ -384,17 +346,6 @@
     { "gemmlowp_output_stage_quantize_down_float", "gemmlowp.cl" },
     { "generate_proposals_compute_all_anchors", "generate_proposals.cl" },
     { "generate_proposals_compute_all_anchors_quantized", "generate_proposals_quantized.cl" },
-    { "harris_score_3x3", "harris_corners.cl" },
-    { "harris_score_5x5", "harris_corners.cl" },
-    { "harris_score_7x7", "harris_corners.cl" },
-    { "hist_border_kernel", "histogram.cl" },
-    { "hist_border_kernel_fixed", "histogram.cl" },
-    { "hist_local_kernel", "histogram.cl" },
-    { "hist_local_kernel_fixed", "histogram.cl" },
-    { "hog_block_normalization", "hog.cl" },
-    { "hog_detector", "hog.cl" },
-    { "hog_orientation_binning", "hog.cl" },
-    { "hysteresis", "canny.cl" },
     { "im2col1x1_stridex1_nchw", "im2col.cl" },
     { "im2col3x3_nchw", "im2col.cl" },
     { "im2col5x5_nchw", "im2col.cl" },
@@ -404,36 +355,14 @@
     { "im2col3x3_nhwc", "im2col.cl" },
     { "im2col9x9_nhwc", "im2col.cl" },
     { "im2col_generic_nhwc", "im2col.cl" },
-    { "init_level", "optical_flow_pyramid_lk.cl" },
-    { "init_level_max", "optical_flow_pyramid_lk.cl" },
-    { "init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl" },
     { "instance_normalization", "instance_normalization.cl" },
-    { "integral_horizontal", "integral_image.cl" },
-    { "integral_vertical", "integral_image.cl" },
-    { "IYUV_to_NV12_bt709", "color_convert.cl" },
-    { "IYUV_to_RGB888_bt709", "color_convert.cl" },
-    { "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
-    { "IYUV_to_YUV444_bt709", "color_convert.cl" },
     { "l2_normalize_x", "l2_normalize.cl" },
     { "l2_normalize_y", "l2_normalize.cl" },
     { "l2_normalize_z", "l2_normalize.cl" },
-    { "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
-    { "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
-    { "magnitude_phase", "magnitude_phase.cl" },
     { "max_unpooling_layer_2", "unpooling_layer.cl" },
-    { "mean_stddev_accumulate", "mean_stddev.cl" },
     { "mean_stddev_normalization", "mean_stddev_normalization.cl" },
     { "memset", "memset.cl" },
-    { "minmax", "minmaxloc.cl" },
-    { "minmax_border", "minmaxloc.cl" },
     { "minmax_layer", "minmax_layer.cl" },
-    { "minmaxloc", "minmaxloc.cl" },
-    { "non_linear_filter_box3x3", "non_linear_filter3x3.cl" },
-    { "non_linear_filter_cross3x3", "non_linear_filter3x3.cl" },
-    { "non_linear_filter_disk3x3", "non_linear_filter3x3.cl" },
-    { "non_linear_filter_box5x5", "non_linear_filter5x5.cl" },
-    { "non_linear_filter_cross5x5", "non_linear_filter5x5.cl" },
-    { "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" },
     { "non_max_suppression", "nonmax.cl" },
     { "normalization_layer_cross_map", "normalization_layer.cl" },
     { "normalization_layer_in_map_nchw", "normalization_layer.cl" },
@@ -442,14 +371,6 @@
     { "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" },
     { "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" },
     { "normalize_planar_yuv_layer_q8_nhwc", "normalize_planar_yuv_layer_quantized.cl" },
-    { "NV12_to_IYUV_bt709", "color_convert.cl" },
-    { "NV12_to_RGB888_bt709", "color_convert.cl" },
-    { "NV12_to_RGBA8888_bt709", "color_convert.cl" },
-    { "NV12_to_YUV444_bt709", "color_convert.cl" },
-    { "NV21_to_IYUV_bt709", "color_convert.cl" },
-    { "NV21_to_RGB888_bt709", "color_convert.cl" },
-    { "NV21_to_RGBA8888_bt709", "color_convert.cl" },
-    { "NV21_to_YUV444_bt709", "color_convert.cl" },
     { "pad_layer_constant", "pad_layer.cl" },
     { "pad_layer_symmetric_reflect", "pad_layer.cl" },
     { "permute", "permute.cl" },
@@ -485,15 +406,6 @@
     { "reshape_layer", "reshape_layer.cl" },
     { "reshape_to_columns", "convolution_layer.cl" },
     { "reverse", "reverse.cl" },
-    { "RGB888_to_IYUV_bt709", "color_convert.cl" },
-    { "RGB888_to_NV12_bt709", "color_convert.cl" },
-    { "RGB888_to_RGBA8888_bt709", "color_convert.cl" },
-    { "RGB888_to_U8_bt709", "color_convert.cl" },
-    { "RGB888_to_YUV444_bt709", "color_convert.cl" },
-    { "RGBA8888_to_IYUV_bt709", "color_convert.cl" },
-    { "RGBA8888_to_NV12_bt709", "color_convert.cl" },
-    { "RGBA8888_to_RGB888_bt709", "color_convert.cl" },
-    { "RGBA8888_to_YUV444_bt709", "color_convert.cl" },
     { "roi_align_layer", "roi_align_layer.cl" },
     { "roi_align_layer_quantized", "roi_align_layer_quantized.cl" },
     { "roi_pooling_layer", "roi_pooling_layer.cl" },
@@ -503,15 +415,9 @@
     { "scale_bilinear_nhwc", "scale.cl" },
     { "scale_bilinear_quantized_nchw", "scale_quantized.cl" },
     { "scale_bilinear_quantized_nhwc", "scale_quantized.cl" },
-    { "scharr3x3", "scharr_filter.cl" },
     { "select_same_rank", "select.cl" },
     { "select_different_rank_2", "select.cl" },
     { "select_different_rank_n", "select.cl" },
-    { "sobel3x3", "sobel_filter.cl" },
-    { "sobel_separable5x1", "sobel_filter.cl" },
-    { "sobel_separable1x5", "sobel_filter.cl" },
-    { "sobel_separable7x1", "sobel_filter.cl" },
-    { "sobel_separable1x7", "sobel_filter.cl" },
     { "softmax_layer_norm", "softmax_layer.cl" },
     { "softmax_layer_norm_quantized", "softmax_layer_quantized.cl" },
     { "softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl" },
@@ -526,23 +432,10 @@
     { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
     { "stack_layer", "stack_layer.cl" },
     { "strided_slice", "slice_ops.cl" },
-    { "suppress_non_maximum", "canny.cl" },
-    { "tablelookup_U8", "tablelookup.cl" },
-    { "tablelookup_S16", "tablelookup.cl" },
-    { "threshold_binary", "threshold.cl" },
-    { "threshold_range", "threshold.cl" },
     { "tile", "tile.cl" },
     { "transpose", "transpose.cl" },
-    { "UYVY422_to_IYUV_bt709", "color_convert.cl" },
-    { "UYVY422_to_NV12_bt709", "color_convert.cl" },
-    { "UYVY422_to_RGB888_bt709", "color_convert.cl" },
-    { "UYVY422_to_RGBA8888_bt709", "color_convert.cl" },
     { "upsample_layer_nchw", "upsample_layer.cl" },
     { "upsample_layer_nhwc", "upsample_layer.cl" },
-    { "warp_affine_nearest_neighbour", "warp_affine.cl" },
-    { "warp_affine_bilinear", "warp_affine.cl" },
-    { "warp_perspective_nearest_neighbour", "warp_perspective.cl" },
-    { "warp_perspective_bilinear", "warp_perspective.cl" },
     { "winograd_filter_transform_2x2_3x3_nchw", "winograd_filter_transform.cl" },
     { "winograd_filter_transform_2x1_3x1_nchw", "winograd_filter_transform.cl" },
     { "winograd_filter_transform_1x2_1x3_nchw", "winograd_filter_transform.cl" },
@@ -602,24 +495,12 @@
     { "winograd_output_transform_1x2_1x7_nhwc", "winograd_output_transform.cl" },
     { "yolo_layer_nchw", "yolo_layer.cl" },
     { "yolo_layer_nhwc", "yolo_layer.cl" },
-    { "YUYV422_to_IYUV_bt709", "color_convert.cl" },
-    { "YUYV422_to_NV12_bt709", "color_convert.cl" },
-    { "YUYV422_to_RGB888_bt709", "color_convert.cl" },
-    { "YUYV422_to_RGBA8888_bt709", "color_convert.cl" },
 };
 
 const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
 {
 #ifdef EMBEDDED_KERNELS
     {
-        "absdiff.cl",
-#include "./cl_kernels/absdiff.clembed"
-    },
-    {
-        "accumulate.cl",
-#include "./cl_kernels/accumulate.clembed"
-    },
-    {
         "activation_layer.cl",
 #include "./cl_kernels/activation_layer.clembed"
     },
@@ -648,18 +529,6 @@
 #include "./cl_kernels/bounding_box_transform_quantized.clembed"
     },
     {
-        "canny.cl",
-#include "./cl_kernels/canny.clembed"
-    },
-    {
-        "channel_combine.cl",
-#include "./cl_kernels/channel_combine.clembed"
-    },
-    {
-        "channel_extract.cl",
-#include "./cl_kernels/channel_extract.clembed"
-    },
-    {
         "channel_shuffle.cl",
 #include "./cl_kernels/channel_shuffle.clembed"
     },
@@ -676,38 +545,14 @@
 #include "./cl_kernels/concatenate.clembed"
     },
     {
-        "color_convert.cl",
-#include "./cl_kernels/color_convert.clembed"
-    },
-    {
         "convert_fc_weights.cl",
 #include "./cl_kernels/convert_fc_weights.clembed"
-    },
-    {
-        "convolution3x3.cl",
-#include "./cl_kernels/convolution3x3.clembed"
-    },
-    {
-        "convolution5x5.cl",
-#include "./cl_kernels/convolution5x5.clembed"
-    },
-    {
-        "convolution7x7.cl",
-#include "./cl_kernels/convolution7x7.clembed"
-    },
-    {
-        "convolution9x9.cl",
-#include "./cl_kernels/convolution9x9.clembed"
-    },
+    },    
     {
         "convolution_layer.cl",
 #include "./cl_kernels/convolution_layer.clembed"
     },
     {
-        "convolution_rectangle.cl",
-#include "./cl_kernels/convolution_rectangle.clembed"
-    },
-    {
         "copy_tensor.cl",
 #include "./cl_kernels/copy_tensor.clembed"
     },
@@ -744,14 +589,6 @@
 #include "./cl_kernels/dequantization_layer.clembed"
     },
     {
-        "derivative.cl",
-#include "./cl_kernels/derivative.clembed"
-    },
-    {
-        "dilate.cl",
-#include "./cl_kernels/dilate.clembed"
-    },
-    {
         "direct_convolution1x1.cl",
 #include "./cl_kernels/direct_convolution1x1.clembed"
     },
@@ -784,14 +621,6 @@
 #include "./cl_kernels/elementwise_unary.clembed"
     },
     {
-        "erode.cl",
-#include "./cl_kernels/erode.clembed"
-    },
-    {
-        "fast_corners.cl",
-#include "./cl_kernels/fast_corners.clembed"
-    },
-    {
         "fft.cl",
 #include "./cl_kernels/fft.clembed"
     },
@@ -816,10 +645,6 @@
 #include "./cl_kernels/gather.clembed"
     },
     {
-        "gaussian_pyramid.cl",
-#include "./cl_kernels/gaussian_pyramid.clembed"
-    },
-    {
         "gemm.cl",
 #include "./cl_kernels/gemm.clembed"
     },
@@ -844,10 +669,6 @@
 #include "./cl_kernels/generate_proposals_quantized.clembed"
     },
     {
-        "harris_corners.cl",
-#include "./cl_kernels/harris_corners.clembed"
-    },
-    {
         "helpers.h",
 #include "./cl_kernels/helpers.hembed"
     },
@@ -856,14 +677,6 @@
 #include "./cl_kernels/helpers_asymm.hembed"
     },
     {
-        "histogram.cl",
-#include "./cl_kernels/histogram.clembed"
-    },
-    {
-        "hog.cl",
-#include "./cl_kernels/hog.clembed"
-    },
-    {
         "im2col.cl",
 #include "./cl_kernels/im2col.clembed"
     },
@@ -872,22 +685,10 @@
 #include "./cl_kernels/instance_normalization.clembed"
     },
     {
-        "integral_image.cl",
-#include "./cl_kernels/integral_image.clembed"
-    },
-    {
         "l2_normalize.cl",
 #include "./cl_kernels/l2_normalize.clembed"
     },
     {
-        "magnitude_phase.cl",
-#include "./cl_kernels/magnitude_phase.clembed"
-    },
-    {
-        "mean_stddev.cl",
-#include "./cl_kernels/mean_stddev.clembed"
-    },
-    {
         "mean_stddev_normalization.cl",
 #include "./cl_kernels/mean_stddev_normalization.clembed"
     },
@@ -896,26 +697,10 @@
 #include "./cl_kernels/memset.clembed"
     },
     {
-        "minmaxloc.cl",
-#include "./cl_kernels/minmaxloc.clembed"
-    },
-    {
         "minmax_layer.cl",
 #include "./cl_kernels/minmax_layer.clembed"
     },
     {
-        "non_linear_filter3x3.cl",
-#include "./cl_kernels/non_linear_filter3x3.clembed"
-    },
-    {
-        "non_linear_filter5x5.cl",
-#include "./cl_kernels/non_linear_filter5x5.clembed"
-    },
-    {
-        "non_linear_filter_helpers.h",
-#include "./cl_kernels/non_linear_filter_helpers.hembed"
-    },
-    {
         "nonmax.cl",
 #include "./cl_kernels/nonmax.clembed"
     },
@@ -936,10 +721,6 @@
 #include "./cl_kernels/batchnormalization_layer.clembed"
     },
     {
-        "optical_flow_pyramid_lk.cl",
-#include "./cl_kernels/optical_flow_pyramid_lk.clembed"
-    },
-    {
         "pad_layer.cl",
 #include "./cl_kernels/pad_layer.clembed"
     },
@@ -1020,18 +801,10 @@
 #include "./cl_kernels/scale_quantized.clembed"
     },
     {
-        "scharr_filter.cl",
-#include "./cl_kernels/scharr_filter.clembed"
-    },
-    {
         "select.cl",
 #include "./cl_kernels/select.clembed"
     },
     {
-        "sobel_filter.cl",
-#include "./cl_kernels/sobel_filter.clembed"
-    },
-    {
         "softmax_layer.cl",
 #include "./cl_kernels/softmax_layer.clembed"
     },
@@ -1056,14 +829,6 @@
 #include "./cl_kernels/stack_layer.clembed"
     },
     {
-        "tablelookup.cl",
-#include "./cl_kernels/tablelookup.clembed"
-    },
-    {
-        "threshold.cl",
-#include "./cl_kernels/threshold.clembed"
-    },
-    {
         "tile.cl",
 #include "./cl_kernels/tile.clembed"
     },
@@ -1080,18 +845,6 @@
 #include "./cl_kernels/unpooling_layer.clembed"
     },
     {
-        "warp_affine.cl",
-#include "./cl_kernels/warp_affine.clembed"
-    },
-    {
-        "warp_helpers.h",
-#include "./cl_kernels/warp_helpers.hembed"
-    },
-    {
-        "warp_perspective.cl",
-#include "./cl_kernels/warp_perspective.clembed"
-    },
-    {
         "winograd_filter_transform.cl",
 #include "./cl_kernels/winograd_filter_transform.clembed"
     },

diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h
index 7383dce..22c9cd9 100644
--- a/src/core/CL/CLKernels.h
+++ b/src/core/CL/CLKernels.h

@@ -25,23 +25,15 @@
 #define ARM_COMPUTE_CLKERNELS_H
 
 /* Header regrouping all the CL kernels */
-#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-#include "src/core/CL/kernels/CLAccumulateKernel.h"
 #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
 #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 #include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
 #include "src/core/CL/kernels/CLBitwiseKernel.h"
 #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
-#include "src/core/CL/kernels/CLBox3x3Kernel.h"
-#include "src/core/CL/kernels/CLCannyEdgeKernel.h"
-#include "src/core/CL/kernels/CLChannelCombineKernel.h"
-#include "src/core/CL/kernels/CLChannelExtractKernel.h"
 #include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
 #include "src/core/CL/kernels/CLCol2ImKernel.h"
-#include "src/core/CL/kernels/CLColorConvertKernel.h"
 #include "src/core/CL/kernels/CLComparisonKernel.h"
 #include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
-#include "src/core/CL/kernels/CLConvolutionKernel.h"
 #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
 #include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
 #include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
@@ -51,14 +43,10 @@
 #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
 #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
 #include "src/core/CL/kernels/CLDequantizationLayerKernel.h"
-#include "src/core/CL/kernels/CLDerivativeKernel.h"
-#include "src/core/CL/kernels/CLDilateKernel.h"
 #include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
-#include "src/core/CL/kernels/CLErodeKernel.h"
 #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
 #include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
 #include "src/core/CL/kernels/CLFFTScaleKernel.h"
-#include "src/core/CL/kernels/CLFastCornersKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
@@ -77,28 +65,14 @@
 #include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
 #include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "src/core/CL/kernels/CLGatherKernel.h"
-#include "src/core/CL/kernels/CLGaussian3x3Kernel.h"
-#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
-#include "src/core/CL/kernels/CLGaussianPyramidKernel.h"
 #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
-#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
-#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
-#include "src/core/CL/kernels/CLHarrisCornersKernel.h"
-#include "src/core/CL/kernels/CLHistogramKernel.h"
 #include "src/core/CL/kernels/CLIm2ColKernel.h"
 #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
-#include "src/core/CL/kernels/CLIntegralImageKernel.h"
 #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
 #include "src/core/CL/kernels/CLLKTrackerKernel.h"
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
 #include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
-#include "src/core/CL/kernels/CLMeanStdDevKernel.h"
 #include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
-#include "src/core/CL/kernels/CLMedian3x3Kernel.h"
 #include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
-#include "src/core/CL/kernels/CLMinMaxLocationKernel.h"
-#include "src/core/CL/kernels/CLNonLinearFilterKernel.h"
-#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
 #include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
 #include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
 #include "src/core/CL/kernels/CLPadLayerKernel.h"
@@ -114,22 +88,14 @@
 #include "src/core/CL/kernels/CLReorgLayerKernel.h"
 #include "src/core/CL/kernels/CLReverseKernel.h"
 #include "src/core/CL/kernels/CLScaleKernel.h"
-#include "src/core/CL/kernels/CLScharr3x3Kernel.h"
 #include "src/core/CL/kernels/CLSelectKernel.h"
-#include "src/core/CL/kernels/CLSobel3x3Kernel.h"
-#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
-#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
 #include "src/core/CL/kernels/CLSoftmaxLayerKernel.h"
 #include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
 #include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
 #include "src/core/CL/kernels/CLStackLayerKernel.h"
 #include "src/core/CL/kernels/CLStridedSliceKernel.h"
-#include "src/core/CL/kernels/CLTableLookupKernel.h"
-#include "src/core/CL/kernels/CLThresholdKernel.h"
 #include "src/core/CL/kernels/CLTileKernel.h"
 #include "src/core/CL/kernels/CLTransposeKernel.h"
-#include "src/core/CL/kernels/CLWarpAffineKernel.h"
-#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h"
 #include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h"
 #include "src/core/CL/kernels/CLWinogradInputTransformKernel.h"

diff --git a/src/core/CL/cl_kernels/absdiff.cl b/src/core/CL/cl_kernels/absdiff.cl
deleted file mode 100644
index a09caf5..0000000
--- a/src/core/CL/cl_kernels/absdiff.cl
+++ /dev/null

@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Calculate the absolute difference of two input images.
- *
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:\n
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in]  in1_ptr                           Pointer to the first source image. Supported data types: U8, S16
- * @param[in]  in1_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  in1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  in1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  in2_ptr                           Pointer to the second source image. Supported data types: U8, S16
- * @param[in]  in2_stride_x                      Stride of the second source image in X dimension (in bytes)
- * @param[in]  in2_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the second source image in Y dimension (in bytes)
- * @param[in]  in2_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void absdiff(
-    IMAGE_DECLARATION(in1),
-    IMAGE_DECLARATION(in2),
-    IMAGE_DECLARATION(out))
-{
-    Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
-    Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-
-    vstore16(CONVERT_SAT(abs_diff(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}

diff --git a/src/core/CL/cl_kernels/accumulate.cl b/src/core/CL/cl_kernels/accumulate.cl
deleted file mode 100644
index 9e37830..0000000
--- a/src/core/CL/cl_kernels/accumulate.cl
+++ /dev/null

@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function accumulates an input image into output image.
- *
- * @param[in]  input_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] accu_ptr                            Pointer to the destination image. Supported data types: S16
- * @param[in]  accu_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  accu_step_x                         accu_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  accu_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  accu_step_y                         accu_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  accu_offset_first_element_in_bytes  The offset of the first element in the destination image
- */
-__kernel void accumulate(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(accu))
-{
-    // Get pixels pointer
-    Image input = CONVERT_TO_IMAGE_STRUCT(input);
-    Image accu  = CONVERT_TO_IMAGE_STRUCT(accu);
-
-    // Load data
-    uchar16 in_data   = vload16(0, input.ptr);
-    short16 accu_data = vload16(0, (__global short *)accu.ptr);
-
-    // Perform accumulation
-    short16 res = add_sat(convert_short16(in_data), accu_data);
-
-    // Store result
-    vstore16(res, 0, (__global short *)accu.ptr);
-}
-
-/** This function accumulates a weighted value from an input image to an output image.
- *
- * @param[in]  input_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] accu_ptr                            Pointer to the destination image. Supported data types: S16
- * @param[in]  accu_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  accu_step_x                         accu_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  accu_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  accu_step_y                         accu_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  accu_offset_first_element_in_bytes  The offset of the first element in the destination image
- * @param[in]  alpha                               The float scalar value with a value in the range of 0 to 1
- */
-__kernel void accumulate_weighted(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(accu),
-    const float alpha)
-{
-    // Get pixels pointer
-    Image input = CONVERT_TO_IMAGE_STRUCT(input);
-    Image accu  = CONVERT_TO_IMAGE_STRUCT(accu);
-
-    // Load data
-    const float16 in_data   = convert_float16(vload16(0, input.ptr));
-    const float16 accu_data = convert_float16(vload16(0, accu.ptr));
-
-    // Calculate weighted accumulation
-    const uchar16 res = convert_uchar16((1.0f - alpha) * accu_data + alpha * in_data);
-
-    // Store result
-    vstore16(res, 0, accu.ptr);
-}
-
-/** This function accumulates a squared value from an input image to an output image.
- *
- * @param[in]  input_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] accu_ptr                            Pointer to the destination image. Supported data types: S16
- * @param[in]  accu_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  accu_step_x                         accu_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  accu_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  accu_step_y                         accu_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  accu_offset_first_element_in_bytes  The offset of the first element in the destination image
- * @param[in]  shift                               The U32 scalar value with a value in the range of 0 to 15
- */
-__kernel void accumulate_squared(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(accu),
-    const uint shift)
-{
-    // Get pixels pointer
-    Image input = CONVERT_TO_IMAGE_STRUCT(input);
-    Image accu  = CONVERT_TO_IMAGE_STRUCT(accu);
-
-    // Load data
-    ushort16 in_data   = convert_ushort16(vload16(0, input.ptr));
-    uint16   accu_data = convert_uint16(vload16(0, (__global short *)accu.ptr));
-
-    // Calculate squared accumulation
-    short16 res = convert_short16_sat(accu_data + convert_uint16((in_data * in_data) >> shift));
-
-    // Store result
-    vstore16(res, 0, (__global short *)accu.ptr);
-}

diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl
deleted file mode 100644
index bcff843..0000000
--- a/src/core/CL/cl_kernels/canny.cl
+++ /dev/null

@@ -1,454 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Calculate the magnitude and phase from horizontal and vertical result of sobel result.
- *
- * @note The calculation of gradient uses level 1 normalisation.
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in]  src1_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
- * @param[in]  src1_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src1_step_x                         src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src1_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src1_step_y                         src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src1_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  src2_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
- * @param[in]  src2_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src2_step_x                         src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src2_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src2_step_y                         src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src2_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] grad_ptr                            Pointer to the gradient output. Supported data types: U16, U32
- * @param[in]  grad_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  grad_step_x                         grad_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  grad_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  grad_step_y                         grad_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  grad_offset_first_element_in_bytes  The offset of the first element of the output
- * @param[out] angle_ptr                           Pointer to the angle output. Supported data types: U8
- * @param[in]  angle_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  angle_step_x                        angle_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  angle_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  angle_step_y                        angle_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  angle_offset_first_element_in_bytes The offset of the first element of the output
- */
-__kernel void combine_gradients_L1(
-    IMAGE_DECLARATION(src1),
-    IMAGE_DECLARATION(src2),
-    IMAGE_DECLARATION(grad),
-    IMAGE_DECLARATION(angle))
-{
-    // Construct images
-    Image src1  = CONVERT_TO_IMAGE_STRUCT(src1);
-    Image src2  = CONVERT_TO_IMAGE_STRUCT(src2);
-    Image grad  = CONVERT_TO_IMAGE_STRUCT(grad);
-    Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
-
-    // Load sobel horizontal and vertical values
-    VEC_DATA_TYPE(DATA_TYPE_IN, 4)
-    h = vload4(0, (__global DATA_TYPE_IN *)src1.ptr);
-    VEC_DATA_TYPE(DATA_TYPE_IN, 4)
-    v = vload4(0, (__global DATA_TYPE_IN *)src2.ptr);
-
-    /* Calculate the gradient, using level 1 normalisation method */
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 4)
-    m = CONVERT_SAT((abs(h) + abs(v)), VEC_DATA_TYPE(DATA_TYPE_OUT, 4));
-
-    /* Calculate the angle */
-    float4 p = 180.0f * atan2pi(convert_float4(v), convert_float4(h));
-
-    /* Remap angle to range [0, 256) */
-    p = select(p, p + 180.0f, p < 0.0f);
-
-    /* Store results */
-    vstore4(m, 0, (__global DATA_TYPE_OUT *)grad.ptr);
-    vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
-}
-
-/** Calculate the gradient and angle from horizontal and vertical result of sobel result.
- *
- * @note The calculation of gradient uses level 2 normalisation
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in]  src1_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
- * @param[in]  src1_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src1_step_x                         src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src1_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src1_step_y                         src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src1_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  src2_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
- * @param[in]  src2_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src2_step_x                         src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src2_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src2_step_y                         src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src2_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] grad_ptr                            Pointer to the gradient output. Supported data types: U16, U32
- * @param[in]  grad_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  grad_step_x                         grad_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  grad_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  grad_step_y                         grad_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  grad_offset_first_element_in_bytes  The offset of the first element of the output
- * @param[out] angle_ptr                           Pointer to the angle output. Supported data types: U8
- * @param[in]  angle_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  angle_step_x                        angle_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  angle_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  angle_step_y                        angle_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  angle_offset_first_element_in_bytes The offset of the first element of the output
- */
-__kernel void combine_gradients_L2(
-    IMAGE_DECLARATION(src1),
-    IMAGE_DECLARATION(src2),
-    IMAGE_DECLARATION(grad),
-    IMAGE_DECLARATION(angle))
-{
-    // Construct images
-    Image src1  = CONVERT_TO_IMAGE_STRUCT(src1);
-    Image src2  = CONVERT_TO_IMAGE_STRUCT(src2);
-    Image grad  = CONVERT_TO_IMAGE_STRUCT(grad);
-    Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
-
-    // Load sobel horizontal and vertical values
-    float4 h = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src1.ptr));
-    float4 v = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src2.ptr));
-
-    /* Calculate the gradient, using level 2 normalisation method */
-    float4 m = sqrt(h * h + v * v);
-
-    /* Calculate the angle */
-    float4 p = 180.0f * atan2pi(v, h);
-
-    /* Remap angle to range [0, 256) */
-    p = select(p, p + 180.0f, p < 0.0f);
-
-    /* Store results */
-    vstore4(CONVERT_SAT_ROUND(m, VEC_DATA_TYPE(DATA_TYPE_OUT, 4), rte), 0, (__global DATA_TYPE_OUT *)grad.ptr);
-    vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
-}
-
-#define EDGE 255
-#define NO_EDGE 0
-
-/** Array that holds the relative coordinates offset for the neighbouring pixels.
- */
-__constant short4 neighbours_coords[] =
-{
-    { -1, 0, 1, 0 },  // 0
-    { -1, -1, 1, 1 }, // 45
-    { 0, -1, 0, 1 },  // 90
-    { 1, -1, -1, 1 }, // 135
-};
-
-/** Perform non maximum suppression.
- *
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in]  grad_ptr                              Pointer to the gradient output. Supported data types: S16, S32
- * @param[in]  grad_stride_x                         Stride of the source image in X dimension (in bytes)
- * @param[in]  grad_step_x                           grad_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  grad_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  grad_step_y                           grad_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  grad_offset_first_element_in_bytes    The offset of the first element of the output
- * @param[in]  angle_ptr                             Pointer to the angle output. Supported data types: U8
- * @param[in]  angle_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  angle_step_x                          angle_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  angle_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  angle_step_y                          angle_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  angle_offset_first_element_in_bytes   TThe offset of the first element of the output
- * @param[out] non_max_ptr                           Pointer to the non maximum suppressed output. Supported data types: U16, U32
- * @param[in]  non_max_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  non_max_step_x                        non_max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  non_max_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  non_max_step_y                        non_max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  non_max_offset_first_element_in_bytes The offset of the first element of the output
- * @param[in]  lower_thr                             The low threshold
- */
-__kernel void suppress_non_maximum(
-    IMAGE_DECLARATION(grad),
-    IMAGE_DECLARATION(angle),
-    IMAGE_DECLARATION(non_max),
-    uint lower_thr)
-{
-    // Construct images
-    Image grad    = CONVERT_TO_IMAGE_STRUCT(grad);
-    Image angle   = CONVERT_TO_IMAGE_STRUCT(angle);
-    Image non_max = CONVERT_TO_IMAGE_STRUCT(non_max);
-
-    // Index
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    // Get gradient and angle
-    DATA_TYPE_IN gradient = *((__global DATA_TYPE_IN *)grad.ptr);
-    uchar an              = *((__global uchar *)angle.ptr);
-
-    // Early return if not greater than lower threshold
-    if(gradient <= lower_thr)
-    {
-        return;
-    }
-
-    // Divide the whole round into 4 directions
-    DATA_TYPE_OUT q_an;
-
-    if(an < 22.5f || an >= 157.5f)
-    {
-        q_an = 0;
-    }
-    else if(an < 67.5f)
-    {
-        q_an = 1;
-    }
-    else if(an < 112.5f)
-    {
-        q_an = 2;
-    }
-    else
-    {
-        q_an = 3;
-    }
-
-    // Find the two pixels in the perpendicular direction
-    short2       x_p = neighbours_coords[q_an].s02;
-    short2       y_p = neighbours_coords[q_an].s13;
-    DATA_TYPE_IN g1  = *((global DATA_TYPE_IN *)offset(&grad, x_p.x, y_p.x));
-    DATA_TYPE_IN g2  = *((global DATA_TYPE_IN *)offset(&grad, x_p.y, y_p.y));
-
-    if((gradient > g1) && (gradient > g2))
-    {
-        __global uchar *non_max_addr            = non_max_ptr + non_max_offset_first_element_in_bytes + x * non_max_stride_x + y * non_max_stride_y;
-        *((global DATA_TYPE_OUT *)non_max_addr) = gradient;
-    }
-}
-
-#define hysteresis_local_stack_L1 8  // The size of level 1 stack. This has to agree with the host side
-#define hysteresis_local_stack_L2 16 // The size of level 2 stack, adjust this can impact the match rate with VX implementation
-
-/** Check whether pixel is valid
- *
- * Skip the pixel if the early_test fails.
- * Otherwise, it tries to add the pixel coordinate to the stack, and proceed to popping the stack instead if the stack is full
- *
- * @param[in] early_test Boolean condition based on the minv check and visited buffer check
- * @param[in] x_pos      X-coordinate of pixel that is going to be recorded, has to be within the boundary
- * @param[in] y_pos      Y-coordinate of pixel that is going to be recorded, has to be within the boundary
- * @param[in] x_cur      X-coordinate of current central pixel
- * @param[in] y_cur      Y-coordinate of current central pixel
- */
-#define check_pixel(early_test, x_pos, y_pos, x_cur, y_cur)                               \
-    {                                                                                     \
-        if(!early_test)                                                                   \
-        {                                                                                 \
-            /* Number of elements in the local stack 1, points to next available entry */ \
-            c = *((__global char *)offset(&l1_stack_counter, x_cur, y_cur));              \
-            \
-            if(c > (hysteresis_local_stack_L1 - 1)) /* Stack level 1 is full */           \
-                goto pop_stack;                                                           \
-            \
-            /* The pixel that has already been recorded is ignored */                     \
-            if(!atomic_or((__global uint *)offset(&recorded, x_pos, y_pos), 1))           \
-            {                                                                             \
-                l1_ptr[c] = (short2)(x_pos, y_pos);                                       \
-                *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)) += 1;         \
-            }                                                                             \
-        }                                                                                 \
-    }
-
-/** Perform hysteresis.
- *
- * @attention The input data_type needs to be passed at compile time using -DDATA_TYPE_IN: e.g. -DDATA_TYPE_IN=short
- *
- * @param[in]  src_ptr                                        Pointer to the input image. Supported data types: U8
- * @param[in]  src_stride_x                                   Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                                     src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                                   Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                                     src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes              The offset of the first element of the output
- * @param[out] out_ptr                                        Pointer to the output image. Supported data types: U8
- * @param[in]  out_stride_x                                   Stride of the source image in X dimension (in bytes)
- * @param[in]  out_step_x                                     out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                                   Stride of the source image in Y dimension (in bytes)
- * @param[in]  out_step_y                                     out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes              The offset of the first element of the output
- * @param[out] visited_ptr                                    Pointer to the visited buffer, where pixels are marked as visited. Supported data types: U32
- * @param[in]  visited_stride_x                               Stride of the source image in X dimension (in bytes)
- * @param[in]  visited_step_x                                 visited_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  visited_stride_y                               Stride of the source image in Y dimension (in bytes)
- * @param[in]  visited_step_y                                 visited_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  visited_offset_first_element_in_bytes          The offset of the first element of the output
- * @param[out] recorded_ptr                                   Pointer to the recorded buffer, where pixels are marked as recorded. Supported data types: U32
- * @param[in]  recorded_stride_x                              Stride of the source image in X dimension (in bytes)
- * @param[in]  recorded_step_x                                recorded_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  recorded_stride_y                              Stride of the source image in Y dimension (in bytes)
- * @param[in]  recorded_step_y                                recorded_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  recorded_offset_first_element_in_bytes         The offset of the first element of the output
- * @param[out] l1_stack_ptr                                   Pointer to the l1 stack of a pixel. Supported data types: S32
- * @param[in]  l1_stack_stride_x                              Stride of the source image in X dimension (in bytes)
- * @param[in]  l1_stack_step_x                                l1_stack_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  l1_stack_stride_y                              Stride of the source image in Y dimension (in bytes)
- * @param[in]  l1_stack_step_y                                l1_stack_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  l1_stack_offset_first_element_in_bytes         The offset of the first element of the output
- * @param[out] l1_stack_counter_ptr                           Pointer to the l1 stack counters of an image. Supported data types: U8
- * @param[in]  l1_stack_counter_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  l1_stack_counter_step_x                        l1_stack_counter_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  l1_stack_counter_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  l1_stack_counter_step_y                        l1_stack_counter_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  l1_stack_counter_offset_first_element_in_bytes The offset of the first element of the output
- * @param[in]  low_thr                                        The lower threshold
- * @param[in]  up_thr                                         The upper threshold
- * @param[in]  width                                          The width of the image.
- * @param[in]  height                                         The height of the image
- */
-kernel void hysteresis(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(out),
-    IMAGE_DECLARATION(visited),
-    IMAGE_DECLARATION(recorded),
-    IMAGE_DECLARATION(l1_stack),
-    IMAGE_DECLARATION(l1_stack_counter),
-    uint low_thr,
-    uint up_thr,
-    int  width,
-    int  height)
-{
-    // Create images
-    Image src              = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
-    Image out              = CONVERT_TO_IMAGE_STRUCT_NO_STEP(out);
-    Image visited          = CONVERT_TO_IMAGE_STRUCT_NO_STEP(visited);
-    Image recorded         = CONVERT_TO_IMAGE_STRUCT_NO_STEP(recorded);
-    Image l1_stack         = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack);
-    Image l1_stack_counter = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack_counter);
-
-    // Index
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    // Load value
-    DATA_TYPE_IN val = *((__global DATA_TYPE_IN *)offset(&src, x, y));
-
-    // If the pixel has already been marked as NO_EDGE, store that value in the output and return
-    if(val == NO_EDGE)
-    {
-        *offset(&out, x, y) = NO_EDGE;
-        return;
-    }
-
-    // Return if it is a MAYBE pixel. Such pixels will become edges if near a strong edge
-    if(val <= up_thr)
-    {
-        return;
-    }
-
-    // Init local stack 2
-    short2 stack_L2[hysteresis_local_stack_L2] = { 0 };
-    int    L2_counter                          = 0;
-
-    // Perform recursive hysteresis
-    while(true)
-    {
-        // Get L1 stack pointer
-        __global short2 *l1_ptr = (__global short2 *)(l1_stack.ptr + y * l1_stack.stride_y + x * hysteresis_local_stack_L1 * l1_stack.stride_x);
-
-        // If the pixel has already been visited, proceed with the items in the stack instead
-        if(atomic_or((__global uint *)offset(&visited, x, y), 1) != 0)
-        {
-            goto pop_stack;
-        }
-
-        // Set strong edge
-        *offset(&out, x, y) = EDGE;
-
-        // If it is the top of stack l2, we don't need check the surrounding pixels
-        if(L2_counter > (hysteresis_local_stack_L2 - 1))
-        {
-            goto pop_stack2;
-        }
-
-        // Points to the start of the local stack;
-        char c;
-
-        VEC_DATA_TYPE(DATA_TYPE_IN, 4)
-        x_tmp;
-        uint4 v_tmp;
-
-        // Get direction pixel indices
-        int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2);
-
-        // Check 8 pixels around for weak edges where low_thr < val <= up_thr
-        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N));
-        v_tmp = vload4(0, (__global uint *)offset(&visited, W, N));
-        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW
-        check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, N, x, y); // N
-        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, N, x, y); // NE
-
-        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, y));
-        v_tmp = vload4(0, (__global uint *)offset(&visited, W, y));
-        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, y, x, y); // W
-        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, y, x, y); // E
-
-        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, S));
-        v_tmp = vload4(0, (__global uint *)offset(&visited, W, S));
-        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, S, x, y); // SW
-        check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, S, x, y); // S
-        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, S, x, y); // SE
-
-#undef check_pixel
-
-pop_stack:
-        c = *((__global char *)offset(&l1_stack_counter, x, y));
-
-        if(c >= 1)
-        {
-            *((__global char *)offset(&l1_stack_counter, x, y)) -= 1;
-            int2 l_c = convert_int2(l1_ptr[c - 1]);
-
-            // Push the current position into level 2 stack
-            stack_L2[L2_counter].x = x;
-            stack_L2[L2_counter].y = y;
-
-            x = l_c.x;
-            y = l_c.y;
-
-            L2_counter++;
-
-            continue;
-        }
-
-        if(L2_counter > 0)
-        {
-            goto pop_stack2;
-        }
-        else
-        {
-            return;
-        }
-
-pop_stack2:
-        L2_counter--;
-        x = stack_L2[L2_counter].x;
-        y = stack_L2[L2_counter].y;
-    };
-}

diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl
deleted file mode 100644
index 550d52e..0000000
--- a/src/core/CL/cl_kernels/channel_combine.cl
+++ /dev/null

@@ -1,416 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function combines three planes to a single RGB image.
- *
- * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: RGB
- * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
- */
-__kernel void channel_combine_RGB888(
-    IMAGE_DECLARATION(plane0),
-    IMAGE_DECLARATION(plane1),
-    IMAGE_DECLARATION(plane2),
-    IMAGE_DECLARATION(dst))
-{
-    // Get pixels pointer
-    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
-    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
-    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
-    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data0 = vload16(0, plane0.ptr);
-    uchar16 data1 = vload16(0, plane1.ptr);
-    uchar16 data2 = vload16(0, plane2.ptr);
-
-    uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0,
-                             data0.s1, data1.s1, data2.s1,
-                             data0.s2, data1.s2, data2.s2,
-                             data0.s3, data1.s3, data2.s3,
-                             data0.s4, data1.s4, data2.s4,
-                             data0.s5);
-    vstore16(out0, 0, dst.ptr);
-
-    uchar16 out1 = (uchar16)(data1.s5, data2.s5, data0.s6,
-                             data1.s6, data2.s6, data0.s7,
-                             data1.s7, data2.s7, data0.s8,
-                             data1.s8, data2.s8, data0.s9,
-                             data1.s9, data2.s9, data0.sA,
-                             data1.sA);
-    vstore16(out1, 0, dst.ptr + 16);
-
-    uchar16 out2 = (uchar16)(data2.sA, data0.sB, data1.sB,
-                             data2.sB, data0.sC, data1.sC,
-                             data2.sC, data0.sD, data1.sD,
-                             data2.sD, data0.sE, data1.sE,
-                             data2.sE, data0.sF, data1.sF,
-                             data2.sF);
-    vstore16(out2, 0, dst.ptr + 32);
-}
-
-/** This function combines three planes to a single RGBA image.
- *
- * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] plane3_ptr                           Pointer to the fourth plane. Supported Format: U8
- * @param[in] plane3_stride_x                      Stride of the fourth plane in X dimension (in bytes)
- * @param[in] plane3_step_x                        plane3_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane3_stride_y                      Stride of the fourth plane in Y dimension (in bytes)
- * @param[in] plane3_step_y                        plane3_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane3_offset_first_element_in_bytes The offset of the first element in the fourth plane
- * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: RGBA
- * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
- */
-__kernel void channel_combine_RGBA8888(
-    IMAGE_DECLARATION(plane0),
-    IMAGE_DECLARATION(plane1),
-    IMAGE_DECLARATION(plane2),
-    IMAGE_DECLARATION(plane3),
-    IMAGE_DECLARATION(dst))
-{
-    // Get pixels pointer
-    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
-    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
-    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
-    Image plane3 = CONVERT_TO_IMAGE_STRUCT(plane3);
-    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data0 = vload16(0, plane0.ptr);
-    uchar16 data1 = vload16(0, plane1.ptr);
-    uchar16 data2 = vload16(0, plane2.ptr);
-    uchar16 data3 = vload16(0, plane3.ptr);
-
-    uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0, data3.s0,
-                             data0.s1, data1.s1, data2.s1, data3.s1,
-                             data0.s2, data1.s2, data2.s2, data3.s2,
-                             data0.s3, data1.s3, data2.s3, data3.s3);
-    vstore16(out0, 0, dst.ptr);
-
-    uchar16 out1 = (uchar16)(data0.s4, data1.s4, data2.s4, data3.s4,
-                             data0.s5, data1.s5, data2.s5, data3.s5,
-                             data0.s6, data1.s6, data2.s6, data3.s6,
-                             data0.s7, data1.s7, data2.s7, data3.s7);
-    vstore16(out1, 0, dst.ptr + 16);
-
-    uchar16 out2 = (uchar16)(data0.s8, data1.s8, data2.s8, data3.s8,
-                             data0.s9, data1.s9, data2.s9, data3.s9,
-                             data0.sA, data1.sA, data2.sA, data3.sA,
-                             data0.sB, data1.sB, data2.sB, data3.sB);
-    vstore16(out2, 0, dst.ptr + 32);
-
-    uchar16 out3 = (uchar16)(data0.sC, data1.sC, data2.sC, data3.sC,
-                             data0.sD, data1.sD, data2.sD, data3.sD,
-                             data0.sE, data1.sE, data2.sE, data3.sE,
-                             data0.sF, data1.sF, data2.sF, data3.sF);
-    vstore16(out3, 0, dst.ptr + 48);
-}
-
-/** This function combines three planes to a single YUYV image.
- *
- * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: YUYV
- * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
- */
-__kernel void channel_combine_YUYV422(
-    IMAGE_DECLARATION(plane0),
-    IMAGE_DECLARATION(plane1),
-    IMAGE_DECLARATION(plane2),
-    IMAGE_DECLARATION(dst))
-{
-    // Get pixels pointer
-    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
-    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
-    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
-    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data0 = vload16(0, plane0.ptr);
-    uchar8  data1 = vload8(0, plane1.ptr);
-    uchar8  data2 = vload8(0, plane2.ptr);
-
-    uchar16 out0 = (uchar16)(data0.s0, data1.s0, data0.s1, data2.s0,
-                             data0.s2, data1.s1, data0.s3, data2.s1,
-                             data0.s4, data1.s2, data0.s5, data2.s2,
-                             data0.s6, data1.s3, data0.s7, data2.s3);
-    vstore16(out0, 0, dst.ptr);
-    uchar16 out1 = (uchar16)(data0.s8, data1.s4, data0.s9, data2.s4,
-                             data0.sA, data1.s5, data0.sB, data2.s5,
-                             data0.sC, data1.s6, data0.sD, data2.s6,
-                             data0.sE, data1.s7, data0.sF, data2.s7);
-    vstore16(out1, 0, dst.ptr + 16);
-}
-
-/** This function combines three planes to a single UYUV image.
- *
- * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: UYUV
- * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
- */
-__kernel void channel_combine_UYVY422(
-    IMAGE_DECLARATION(plane0),
-    IMAGE_DECLARATION(plane1),
-    IMAGE_DECLARATION(plane2),
-    IMAGE_DECLARATION(dst))
-{
-    // Build an Image struct for each source plane and for the destination (macro is defined outside this chunk)
-    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
-    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
-    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
-    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data0 = vload16(0, plane0.ptr); // 16 bytes from plane0
-    uchar8  data1 = vload8(0, plane1.ptr);  // 8 bytes from plane1
-    uchar8  data2 = vload8(0, plane2.ptr);  // 8 bytes from plane2
-
-    uchar16 out0 = (uchar16)(data1.s0, data0.s0, data2.s0, data0.s1, // repeating pattern: plane1[i], plane0[2i], plane2[i], plane0[2i+1]
-                             data1.s1, data0.s2, data2.s1, data0.s3,
-                             data1.s2, data0.s4, data2.s2, data0.s5,
-                             data1.s3, data0.s6, data2.s3, data0.s7); // covers i = 0..3
-    vstore16(out0, 0, dst.ptr); // first 16 output bytes
-    uchar16 out1 = (uchar16)(data1.s4, data0.s8, data2.s4, data0.s9, // same pattern for i = 4..7
-                             data1.s5, data0.sA, data2.s5, data0.sB,
-                             data1.s6, data0.sC, data2.s6, data0.sD,
-                             data1.s7, data0.sE, data2.s7, data0.sF);
-    vstore16(out1, 0, dst.ptr + 16); // second 16 output bytes, 32 bytes written in total
-}
-
-/** This function combines three planes to a single NV12/NV21 image.
- *
- * @note NV12 or NV21 has to be specified through preprocessor macro. eg. -DNV12 performs NV12 channel combine.
- *
- * @param[in] src_plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] src_plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] src_plane0_step_x                        src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] src_plane0_step_y                        src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] src_plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] src_plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] src_plane1_step_x                        src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] src_plane1_step_y                        src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] src_plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] src_plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] src_plane2_step_x                        src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] src_plane2_step_y                        src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] dst_plane0_ptr                           Pointer to the first plane of the destination image. Supported Format: U8
- * @param[in] dst_plane0_stride_x                      Stride of the first plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane0_step_x                        dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane0_stride_y                      Stride of the first plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane0_step_y                        dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
- * @param[in] dst_plane1_ptr                           Pointer to the second plane of the destination image. Supported Format: UV88
- * @param[in] dst_plane1_stride_x                      Stride of the second plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane1_step_x                        dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane1_stride_y                      Stride of the second plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane1_step_y                        dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
- * @param[in] height                                   Sub-sampled height
- */
-__kernel void channel_combine_NV(
-    IMAGE_DECLARATION(src_plane0),
-    IMAGE_DECLARATION(src_plane1),
-    IMAGE_DECLARATION(src_plane2),
-    IMAGE_DECLARATION(dst_plane0),
-    IMAGE_DECLARATION(dst_plane1),
-    uint height)
-{
-    // Build an Image struct for each source and destination plane (macro is defined outside this chunk)
-    Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
-    Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
-    Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
-    Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
-    Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
-
-    // Copy two 16-byte groups of plane 0: one at the current position and one at offset(.., 0, height)
-    vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
-    vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height)); // NOTE(review): presumably addresses the row `height` rows below the current one - confirm against offset() in helpers.h
-
-    // Create the interleaved UV plane from 8 bytes of each of the two remaining source planes
-    uchar8 data1 = vload8(0, src_plane1.ptr);
-    uchar8 data2 = vload8(0, src_plane2.ptr);
-
-#ifdef NV12
-    vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr); // mask 0-7 selects from data1, 8-15 from data2: output is data1[0], data2[0], data1[1], data2[1], ...
-#elif defined(NV21)
-    vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr); // same mask with operands swapped: output is data2[0], data1[0], data2[1], data1[1], ...
-#endif /* NV12 or NV21 */
-} // if neither NV12 nor NV21 is defined, nothing is written to dst_plane1
-
-/** This function combines three planes to a single YUV444 or IYUV image.
- *
- * @note YUV444 or IYUV has to be specified through preprocessor macro. eg. -DIYUV performs IYUV channel combine.
- *
- * @param[in] src_plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] src_plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] src_plane0_step_x                        src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] src_plane0_step_y                        src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] src_plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] src_plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] src_plane1_step_x                        src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] src_plane1_step_y                        src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] src_plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] src_plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] src_plane2_step_x                        src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] src_plane2_step_y                        src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] dst_plane0_ptr                           Pointer to the first plane of the destination image. Supported Format: U8
- * @param[in] dst_plane0_stride_x                      Stride of the first plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane0_step_x                        dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane0_stride_y                      Stride of the first plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane0_step_y                        dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
- * @param[in] dst_plane1_ptr                           Pointer to the second plane of the destination image. Supported Format: U8
- * @param[in] dst_plane1_stride_x                      Stride of the second plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane1_step_x                        dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane1_stride_y                      Stride of the second plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane1_step_y                        dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
- * @param[in] dst_plane2_ptr                           Pointer to the third plane of the destination image. Supported Format: U8
- * @param[in] dst_plane2_stride_x                      Stride of the third plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane2_step_x                        dst_plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane2_stride_y                      Stride of the third plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane2_step_y                        dst_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane2_offset_first_element_in_bytes The offset of the first element in the third plane of the destination image
- * @param[in] height                                   Sub-sampled height
- */
-__kernel void copy_planes_3p(
-    IMAGE_DECLARATION(src_plane0),
-    IMAGE_DECLARATION(src_plane1),
-    IMAGE_DECLARATION(src_plane2),
-    IMAGE_DECLARATION(dst_plane0),
-    IMAGE_DECLARATION(dst_plane1),
-    IMAGE_DECLARATION(dst_plane2),
-    uint height)
-{
-    // Build an Image struct for each source and destination plane (macro is defined outside this chunk)
-    Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
-    Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
-    Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
-    Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
-    Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
-    Image dst_plane2 = CONVERT_TO_IMAGE_STRUCT(dst_plane2);
-
-    // Plane 0 is always copied 16 bytes at a time, regardless of the selected format
-    vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
-#ifdef YUV444
-    vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr); // YUV444: planes 1 and 2 are copied at the same 16-byte width as plane 0
-    vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr);
-#elif defined(IYUV)
-    vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height)); // IYUV: a second 16-byte group of plane 0 at offset(.., 0, height); NOTE(review): presumably the row `height` rows below - confirm against helpers.h
-    vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr); // IYUV: planes 1 and 2 are copied only 8 bytes at a time
-    vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr);
-#endif /* YUV444 or IYUV */
-} // if neither YUV444 nor IYUV is defined, only the first plane 0 copy is performed

diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl
deleted file mode 100644
index b64f248..0000000
--- a/src/core/CL/cl_kernels/channel_extract.cl
+++ /dev/null

@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function extracts a given channel from an RGB image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: RGB
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_RGB888(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for the source and destination (macro comes from the included helpers.h)
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data  = vload16(0, src.ptr);     // source bytes 0..15
-    uchar8  data2 = vload8(0, src.ptr + 16); // source bytes 16..23, so 24 bytes are read in total
-
-#ifdef CHANNEL_R
-    vstore4(data.s0369, 0, dst.ptr); // source bytes 0, 3, 6, 9
-    vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4); // source bytes 12, 15, 18, 21
-#elif defined(CHANNEL_G)
-    vstore4(data.s147A, 0, dst.ptr); // source bytes 1, 4, 7, 10
-    vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4); // source bytes 13, 16, 19, 22
-#elif defined(CHANNEL_B)
-    vstore4(data.s258B, 0, dst.ptr); // source bytes 2, 5, 8, 11
-    vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4); // source bytes 14, 17, 20, 23
-#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B */
-} // each branch writes 8 bytes; with no CHANNEL_* macro defined nothing is written
-
-/** This function extracts a given channel from an RGBA image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: RGBA
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_RGBA8888(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for the source and destination (macro comes from the included helpers.h)
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data  = vload16(0, src.ptr);      // source bytes 0..15
-    uchar16 data2 = vload16(0, src.ptr + 16); // source bytes 16..31, so 32 bytes are read in total
-
-#ifdef CHANNEL_R
-    vstore8((uchar8)(data.s048C, data2.s048C), 0, dst.ptr); // every 4th byte starting at byte 0
-#elif defined(CHANNEL_G)
-    vstore8((uchar8)(data.s159D, data2.s159D), 0, dst.ptr); // every 4th byte starting at byte 1
-#elif defined(CHANNEL_B)
-    vstore8((uchar8)(data.s26AE, data2.s26AE), 0, dst.ptr); // every 4th byte starting at byte 2
-#elif defined(CHANNEL_A)
-    vstore8((uchar8)(data.s37BF, data2.s37BF), 0, dst.ptr); // every 4th byte starting at byte 3
-#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B or CHANNEL_A */
-} // each branch writes 8 bytes; with no CHANNEL_* macro defined nothing is written
-
-/** This function extracts a given channel from a YUYV image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: YUYV
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_YUYV422(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for the source and destination (macro comes from the included helpers.h)
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data = vload16(0, src.ptr); // 16 source bytes
-
-#ifdef CHANNEL_Y
-    vstore8(data.s02468ACE, 0, dst.ptr); // even-indexed bytes: 8 output bytes
-#elif defined(CHANNEL_U)
-    vstore4(data.s159D, 0, dst.ptr); // bytes 1, 5, 9, 13: 4 output bytes
-#elif defined(CHANNEL_V)
-    vstore4(data.s37BF, 0, dst.ptr); // bytes 3, 7, 11, 15: 4 output bytes
-#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
-} // with no CHANNEL_* macro defined nothing is written
-
-/** This function extracts a given channel from a UYVY image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: UYVY
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_UYVY422(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for the source and destination (macro comes from the included helpers.h)
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data = vload16(0, src.ptr); // 16 source bytes
-
-#ifdef CHANNEL_Y
-    vstore8(data.s13579BDF, 0, dst.ptr); // odd-indexed bytes: 8 output bytes
-#elif defined(CHANNEL_U)
-    vstore4(data.s048C, 0, dst.ptr); // bytes 0, 4, 8, 12: 4 output bytes
-#elif defined(CHANNEL_V)
-    vstore4(data.s26AE, 0, dst.ptr); // bytes 2, 6, 10, 14: 4 output bytes
-#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
-} // with no CHANNEL_* macro defined nothing is written
-
-/** This function extracts a given channel from an NV12 image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- * @warning Only channels UV can be extracted using this kernel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: NV12 (UV88)
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_NV12(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for the source and destination (macro comes from the included helpers.h)
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data = vload16(0, src.ptr); // 16 interleaved source bytes
-
-#ifdef CHANNEL_U
-    vstore8(data.s02468ACE, 0, dst.ptr); // U is taken from the even-indexed bytes
-#elif defined(CHANNEL_V)
-    vstore8(data.s13579BDF, 0, dst.ptr); // V is taken from the odd-indexed bytes
-#endif /* CHANNEL_U or CHANNEL_V */
-} // with neither CHANNEL_U nor CHANNEL_V defined nothing is written
-
-/** This function extracts a given channel from an NV21 image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- * @warning Only channels UV can be extracted using this kernel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: NV21 (UV88)
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_NV21(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for the source and destination (macro comes from the included helpers.h)
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data = vload16(0, src.ptr); // 16 interleaved source bytes
-
-#ifdef CHANNEL_U
-    vstore8(data.s13579BDF, 0, dst.ptr); // U is taken from the odd-indexed bytes (reverse of the NV12 kernel)
-#elif defined(CHANNEL_V)
-    vstore8(data.s02468ACE, 0, dst.ptr); // V is taken from the even-indexed bytes
-#endif /* CHANNEL_U or CHANNEL_V */
-} // with neither CHANNEL_U nor CHANNEL_V defined nothing is written
-
-/** This function extracts a given plane from a multi-planar image.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void copy_plane(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for the source and destination (macro comes from the included helpers.h)
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Copy 8 bytes from the source position to the destination position unchanged
-    vstore8(vload8(0, src.ptr), 0, dst.ptr);
-}

diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
deleted file mode 100644
index cbebc88..0000000
--- a/src/core/CL/cl_kernels/color_convert.cl
+++ /dev/null

@@ -1,1911 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Convert an RGB888 image to RGBX8888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void RGB888_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 16 pixels every time
-    uchar16 rgb_0 = vload16(0, in.ptr);
-    uchar16 rgb_1 = vload16(0, in.ptr + 16);
-    uchar16 rgb_2 = vload16(0, in.ptr + 32);
-
-    uchar16 rgba_0 = (uchar16)(rgb_0.s012, 255, rgb_0.s345, 255, rgb_0.s678, 255, rgb_0.s9ab, 255);
-    uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.sf, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255);
-    uchar16 rgba_2 = (uchar16)(rgb_1.s89a, 255, rgb_1.sbcd, 255, rgb_1.sef, rgb_2.s0, 255, rgb_2.s123, 255);
-    uchar16 rgba_3 = (uchar16)(rgb_2.s456, 255, rgb_2.s789, 255, rgb_2.sabc, 255, rgb_2.sdef, 255);
-
-    vstore16(rgba_0, 0, out.ptr);
-    vstore16(rgba_1, 0, out.ptr + 16);
-    vstore16(rgba_2, 0, out.ptr + 32);
-    vstore16(rgba_3, 0, out.ptr + 48);
-}
-
-/** Convert an RGB888 image to U8
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: RGB888
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void RGB888_to_U8_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 16 pixels every time
-    const uchar16 rgb_0 = vload16(0, in.ptr);
-    const uchar16 rgb_1 = vload16(0, in.ptr + 16);
-    const uchar16 rgb_2 = vload16(0, in.ptr + 32);
-
-    //Resequence values from a sequence of 16 RGB values to sequence of 16 R, 16 G, 16 B values
-    const uchar16 rgb_r = (uchar16)(rgb_0.s0369, rgb_0.scf, rgb_1.s258b, rgb_1.se, rgb_2.s147a, rgb_2.sd);
-    const uchar16 rgb_g = (uchar16)(rgb_0.s147a, rgb_0.sd, rgb_1.s0369, rgb_1.scf, rgb_2.s258b, rgb_2.se);
-    const uchar16 rgb_b = (uchar16)(rgb_0.s258b, rgb_0.se, rgb_1.s147a, rgb_1.sd, rgb_2.s0369, rgb_2.scf);
-
-    const float16 rgb2u8_red_coef_bt709   = 0.2126f;
-    const float16 rgb2u8_green_coef_bt709 = 0.7152f;
-    const float16 rgb2u8_blue_coef_bt709  = 0.0722f;
-
-    //Computation of 16 greyscale values in float
-    const float16 greyscale_f_0 = rgb2u8_red_coef_bt709 * convert_float16(rgb_r) + rgb2u8_green_coef_bt709 * convert_float16(rgb_g) + rgb2u8_blue_coef_bt709 * convert_float16(rgb_b);
-
-    //Convert it to 16 grayscale uchar values
-    const uchar16 greyscale_u8_0 = convert_uchar16_sat_rtz(greyscale_f_0);
-
-    vstore16(greyscale_u8_0, 0, out.ptr);
-}
-
-/** Convert an RGB888 image to RGBX8888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void RGBA8888_to_RGB888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-    // handle 16 pixels every time
-    uchar16 rgba_0 = vload16(0, in.ptr);
-    uchar16 rgba_1 = vload16(0, in.ptr + 16);
-    uchar16 rgba_2 = vload16(0, in.ptr + 32);
-    uchar16 rgba_3 = vload16(0, in.ptr + 48);
-
-    uchar16 rgb_0 = (uchar16)(rgba_0.s01245689, rgba_0.sacde, rgba_1.s0124);
-    uchar16 rgb_1 = (uchar16)(rgba_1.s5689acde, rgba_2.s01245689);
-    uchar16 rgb_2 = (uchar16)(rgba_2.sacde, rgba_3.s01245689, rgba_3.sacde);
-
-    vstore16(rgb_0, 0, out.ptr);
-    vstore16(rgb_1, 0, out.ptr + 16);
-    vstore16(rgb_2, 0, out.ptr + 32);
-}
-
-/** Convert a UYVY422 image to RGB888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void UYVY422_to_RGB888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 8 pixels every time
-    uchar16 uyvy = vload16(0, in.ptr);
-
-    uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
-    char8  cb   = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
-    char8  cr   = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
-
-    float8 red_coef_bt709    = (float8)(1.5748f);
-    float8 green_coef_bt709  = (float8)(-0.1873f);
-    float8 green_coef2_bt709 = (float8)(-0.4681f);
-    float8 blue_coef_bt709   = (float8)(1.8556f);
-    float8 lumav             = convert_float8(luma);
-
-    float8 f_r = red_coef_bt709 * convert_float8(cr);
-    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
-    float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
-    f_r += lumav;
-    f_g += lumav;
-    f_b += lumav;
-
-    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
-    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
-    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
-                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
-    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
-
-    vstore16(rgb_0, 0, out.ptr);
-    vstore8(rgb_1, 0, out.ptr + 16);
-}
-
-/** Convert a UYVY422 image to RGBX8888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void UYVY422_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 8 pixels every time
-    uchar16 uyvy = vload16(0, in.ptr);
-
-    uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
-    char8  cb   = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
-    char8  cr   = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
-
-    float8 red_coef_bt709    = (float8)(1.5748f);
-    float8 green_coef_bt709  = (float8)(-0.1873f);
-    float8 green_coef2_bt709 = (float8)(-0.4681f);
-    float8 blue_coef_bt709   = (float8)(1.8556f);
-    float8 lumav             = convert_float8(luma);
-
-    float8 f_r = red_coef_bt709 * convert_float8(cr);
-    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
-    float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
-    f_r += lumav;
-    f_g += lumav;
-    f_b += lumav;
-
-    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
-    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
-    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
-                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
-                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
-
-    vstore16(rgba_0, 0, out.ptr);
-    vstore16(rgba_1, 0, out.ptr + 16);
-}
-
-/** Convert a YUYV422 image to RGB888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void YUYV422_to_RGB888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 8 pixels every time
-    uchar16 uyvy = vload16(0, in.ptr);
-
-    uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
-    char8  cb   = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
-    char8  cr   = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
-
-    float8 red_coef_bt709    = (float8)(1.5748f);
-    float8 green_coef_bt709  = (float8)(-0.1873f);
-    float8 green_coef2_bt709 = (float8)(-0.4681f);
-    float8 blue_coef_bt709   = (float8)(1.8556f);
-    float8 lumav             = convert_float8(luma);
-
-    float8 f_r = red_coef_bt709 * convert_float8(cr);
-    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
-    float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
-    f_r += lumav;
-    f_g += lumav;
-    f_b += lumav;
-
-    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
-    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
-    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
-                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
-    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
-
-    vstore16(rgb_0, 0, out.ptr);
-    vstore8(rgb_1, 0, out.ptr + 16);
-}
-
-/** Convert a YUYV422 image to RGBX8888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void YUYV422_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 8 pixels every time
-    uchar16 uyvy = vload16(0, in.ptr);
-
-    uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
-    char8  cb   = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
-    char8  cr   = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
-
-    float8 red_coef_bt709    = (float8)(1.5748f);
-    float8 green_coef_bt709  = (float8)(-0.1873f);
-    float8 green_coef2_bt709 = (float8)(-0.4681f);
-    float8 blue_coef_bt709   = (float8)(1.8556f);
-    float8 lumav             = convert_float8(luma);
-
-    float8 f_r = red_coef_bt709 * convert_float8(cr);
-    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
-    float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
-    f_r += lumav;
-    f_g += lumav;
-    f_b += lumav;
-
-    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
-    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
-    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
-                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
-                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
-
-    vstore16(rgba_0, 0, out.ptr);
-    vstore16(rgba_1, 0, out.ptr + 16);
-}
-
-/** Convert a RGB image to NV12 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  input_ptr                           Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_ptr                            Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_stride_x                       Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_step_x                         luma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_stride_y                       Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_step_y                         luma_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_offset_first_element_in_bytes  The offset of the first element in the destination image luma channel
- * @param[out] uv_ptr                              Pointer to the destination uv channel. Supported Format: U8
- * @param[in]  uv_stride_x                         Stride of the destination uv channel in X dimension (in bytes)
- * @param[in]  uv_step_x                           uv_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_stride_y                         Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  uv_step_y                           uv_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_offset_first_element_in_bytes    The offset of the first element in the destination image uv channel
- *
- */
-__kernel void RGB888_to_NV12_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(luma),
-    IMAGE_DECLARATION(uv))
-{
-    Image in     = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma);
-    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
-
-    // handle 4 pixels every time, two lines, each line for 2 pixels
-    // Read 2 pixel of the first line
-    uchar8 rgb_0 = vload8(0, in.ptr);
-    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s3);
-    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s4);
-    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s5);
-
-    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
-    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
-    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
-
-    short2 i_y = convert_short2_rtz(f_y);
-    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
-    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_0, 0, out_y.ptr);
-
-    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Read 2 pixel of the second line
-    uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
-    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s3);
-    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s4);
-    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s5);
-
-    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
-    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
-    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
-    i_y = convert_short2_rtz(f_y);
-    i_u = convert_short2_rtz(f_u) + (short2)(128);
-    i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_1, 0, out_y.ptr + luma_stride_y);
-
-    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
-                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
-
-    vstore2(cbcr, 0, out_uv.ptr);
-}
-
-/*
-    R'= Y' + 0.0000*U + 1.5748*V
-    G'= Y' - 0.1873*U - 0.4681*V
-    B'= Y' + 1.8556*U + 0.0000*V
-*/
-
-/** Convert an NV12 image to RGB888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgb_output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV12_to_RGB888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(rgb_output))
-{
-    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
-
-    // handle 8 pixels every time, two lines, each line for 4 pixels
-    uchar4 luma_0 = vload4(0, in_luma.ptr);
-    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
-    uchar4 cbcr   = vload4(0, in_uv.ptr);
-    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
-    char4  cr     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
-
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
-    float4 f_r = convert_float4(luma_0) + temp0;
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr);
-    vstore4(rgb_1, 0, out_rgb.ptr + 8);
-
-    f_r = convert_float4(luma_1) + temp0;
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
-    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
-
-/** Convert a RGB image to YUV444 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  rgb_input_ptr                             Pointer to the source image. Supported Format: U8
- * @param[in]  rgb_input_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  rgb_input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  rgb_input_step_y                          rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination image V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void RGB888_to_YUV444_bt709(
-    IMAGE_DECLARATION(rgb_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    // Each work-item converts 4 consecutive RGB pixels into 4 Y, 4 U and 4 V samples
-    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v  = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Read 4 pixels: 16 bytes are loaded, but only components s0..sb (4 x 3 bytes) are used below
-    uchar16 rgb_0 = vload16(0, in_rgb.ptr);
-    uchar4  r_0   = (uchar4)(rgb_0.s0, rgb_0.s3, rgb_0.s6, rgb_0.s9); // de-interleave: every 3rd byte from offset 0
-    uchar4  g_0   = (uchar4)(rgb_0.s1, rgb_0.s4, rgb_0.s7, rgb_0.sa); // every 3rd byte from offset 1
-    uchar4  b_0   = (uchar4)(rgb_0.s2, rgb_0.s5, rgb_0.s8, rgb_0.sb); // every 3rd byte from offset 2
-
-    float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0); // Y
-    float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0); // U, before the +128 offset
-    float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0); // V, before the +128 offset
-
-    short4 i_y = convert_short4_rtz(f_y); // float -> short, rounding toward zero
-    short4 i_u = convert_short4_rtz(f_u) + (short4)(128); // shift chroma by 128 so it can be stored unsigned
-    short4 i_v = convert_short4_rtz(f_v) + (short4)(128);
-
-    uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255)))); // clamp to [0, 255] before narrowing to uchar
-    vstore4(luma_0, 0, out_y.ptr);
-
-    uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255)))); // same clamp for U
-    uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255)))); // same clamp for V
-    vstore4(cb_0, 0, out_u.ptr); // one U sample per pixel (no sub-sampling)
-    vstore4(cr_0, 0, out_v.ptr); // one V sample per pixel (no sub-sampling)
-}
-
-/** Convert a RGB image to IYUV using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
- * No offset.
- *
- * @param[in]  rgb_input_ptr                             Pointer to the source image. Supported Format: U8
- * @param[in]  rgb_input_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  rgb_input_step_x                          rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  rgb_input_step_y                          rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void RGB888_to_IYUV_bt709(
-    IMAGE_DECLARATION(rgb_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    // Each work-item handles a 2x2 pixel quad (2 pixels on each of two lines): 4 Y samples out, 1 U and 1 V sample out
-    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v  = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Read 2 pixels of the first line: 8 bytes are loaded, but only components s0..s5 (2 x 3 bytes) are used
-    uchar8 rgb_0 = vload8(0, in_rgb.ptr);
-    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s3);
-    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s4);
-    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s5);
-
-    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0); // Y
-    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0); // U, before the +128 offset
-    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0); // V, before the +128 offset
-
-    short2 i_y = convert_short2_rtz(f_y); // float -> short, rounding toward zero
-    short2 i_u = convert_short2_rtz(f_u) + (short2)(128); // shift chroma by 128 so it can be stored unsigned
-    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); // clamp to [0, 255] before narrowing
-    vstore2(luma_0, 0, out_y.ptr);
-
-    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); // first-line chroma, kept for the average below
-    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Read 2 pixels of the second line (one input row stride further on)
-    uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgb_input_stride_y);
-    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s3);
-    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s4);
-    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s5);
-
-    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1); // same coefficients as the first line
-    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
-    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
-    i_y = convert_short2_rtz(f_y);
-    i_u = convert_short2_rtz(f_u) + (short2)(128);
-    i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y); // second luma row
-
-    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4), // U: integer average of the 4 samples of the quad
-                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4)); // V: integer average of the 4 samples of the quad
-    *out_u.ptr = cbcr.x; // a single U byte is written per quad
-    *out_v.ptr = cbcr.y; // a single V byte is written per quad
-}
-
-/** Convert a RGBA image to YUV444 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  rgba_input_step_x                         rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  rgba_input_step_y                         rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination image V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void RGBA8888_to_YUV444_bt709(
-    IMAGE_DECLARATION(rgba_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    // Each work-item converts 4 consecutive RGBA pixels into 4 Y, 4 U and 4 V samples
-    Image in_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input);
-    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Read 4 pixels (4 x 4 bytes); the 4th byte of each pixel (s3, s7, sb, sf) is not used
-    uchar16 rgb_0 = vload16(0, in_rgba.ptr);
-    uchar4  r_0   = (uchar4)(rgb_0.s0, rgb_0.s4, rgb_0.s8, rgb_0.sc); // de-interleave: every 4th byte from offset 0
-    uchar4  g_0   = (uchar4)(rgb_0.s1, rgb_0.s5, rgb_0.s9, rgb_0.sd); // every 4th byte from offset 1
-    uchar4  b_0   = (uchar4)(rgb_0.s2, rgb_0.s6, rgb_0.sa, rgb_0.se); // every 4th byte from offset 2
-
-    float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0); // Y
-    float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0); // U, before the +128 offset
-    float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0); // V, before the +128 offset
-
-    short4 i_y = convert_short4(f_y); // NOTE(review): no _rtz suffix here, unlike the sibling kernels; presumably equivalent (default float->int rounding) - confirm
-    short4 i_u = convert_short4(f_u) + (short4)(128); // shift chroma by 128 so it can be stored unsigned
-    short4 i_v = convert_short4(f_v) + (short4)(128);
-
-    uchar4 luma_0 = convert_uchar4_sat(max((short4)(0), min(i_y, (short4)(255)))); // value is already clamped to [0, 255] before the saturating conversion
-    vstore4(luma_0, 0, out_y.ptr);
-
-    uchar4 cb_0 = convert_uchar4_sat(max((short4)(0), min(i_u, (short4)(255)))); // same clamp for U
-    uchar4 cr_0 = convert_uchar4_sat(max((short4)(0), min(i_v, (short4)(255)))); // same clamp for V
-    vstore4(cb_0, 0, out_u.ptr); // one U sample per pixel (no sub-sampling)
-    vstore4(cr_0, 0, out_v.ptr); // one V sample per pixel (no sub-sampling)
-}
-
-/** Convert a RGBA image to NV12 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
- * No offset.
- *
- * @param[in]  input_ptr                                 Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                            Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                            Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
- * @param[out] uv_output_ptr                             Pointer to the destination uv channel. Supported Format: U8
- * @param[in]  uv_output_stride_x                        Stride of the destination uv channel in X dimension (in bytes)
- * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_output_stride_y                        Stride of the destination image uv channel in Y dimension (in bytes)
- * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination image uv channel
- *
- */
-__kernel void RGBA8888_to_NV12_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(uv_output))
-{
-    Image in     = CONVERT_TO_IMAGE_STRUCT(input); // RGBA source; a 2x2 pixel quad is read per work-item
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output); // receives 2 Y samples on each of two rows
-    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output); // receives one interleaved (U, V) byte pair
-
-    // Read 2 pixels of the first line (2 x 4 bytes); the 4th byte of each pixel (s3, s7) is not used
-    uchar8 rgb_0 = vload8(0, in.ptr);
-    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s4);
-    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s5);
-    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s6);
-
-    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0); // Y
-    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0); // U, before the +128 offset
-    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0); // V, before the +128 offset
-
-    short2 i_y = convert_short2_rtz(f_y); // float -> short, rounding toward zero
-    short2 i_u = convert_short2_rtz(f_u) + (short2)(128); // shift chroma by 128 so it can be stored unsigned
-    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); // clamp to [0, 255] before narrowing
-    vstore2(luma_0, 0, out_y.ptr);
-
-    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); // first-line chroma, kept for the average below
-    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Read 2 pixels of the second line (one input row stride further on)
-    uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
-    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s4);
-    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s5);
-    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s6);
-
-    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1); // same coefficients as the first line
-    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
-    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
-    i_y = convert_short2_rtz(f_y);
-    i_u = convert_short2_rtz(f_u) + (short2)(128);
-    i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y); // second luma row
-
-    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4), // U: integer average of the 4 samples of the quad
-                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4)); // V: integer average of the 4 samples of the quad
-    vstore2(cbcr, 0, out_uv.ptr); // U and V are written side by side into the single UV plane
-}
-
-/** Convert a RGBA image to IYUV using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
- * No offset.
- *
- * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  rgba_input_step_x                         rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  rgba_input_step_y                         rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void RGBA8888_to_IYUV_bt709(
-    IMAGE_DECLARATION(rgba_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    // Each work-item handles a 2x2 pixel quad (2 pixels on each of two lines): 4 Y samples out, 1 U and 1 V sample out
-    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_input);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v  = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Read 2 pixels of the first line (2 x 4 bytes); the 4th byte of each pixel (s3, s7) is not used
-    uchar8 rgb_0 = vload8(0, in_rgb.ptr);
-    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s4);
-    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s5);
-    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s6);
-
-    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0); // Y
-    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0); // U, before the +128 offset
-    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0); // V, before the +128 offset
-
-    short2 i_y = convert_short2_rtz(f_y); // float -> short, rounding toward zero
-    short2 i_u = convert_short2_rtz(f_u) + (short2)(128); // shift chroma by 128 so it can be stored unsigned
-    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); // clamp to [0, 255] before narrowing
-    vstore2(luma_0, 0, out_y.ptr);
-
-    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); // first-line chroma, kept for the average below
-    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Read 2 pixels of the second line (one input row stride further on)
-    uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgba_input_stride_y);
-    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s4);
-    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s5);
-    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s6);
-
-    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1); // same coefficients as the first line
-    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
-    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
-    i_y = convert_short2_rtz(f_y);
-    i_u = convert_short2_rtz(f_u) + (short2)(128);
-    i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y); // second luma row
-
-    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4), // U: integer average of the 4 samples of the quad
-                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4)); // V: integer average of the 4 samples of the quad
-    *out_u.ptr = cbcr.x; // a single U byte is written per quad
-    *out_v.ptr = cbcr.y; // a single V byte is written per quad
-}
-
-/** Convert an NV12 image to RGBA8888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV12_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(rgb_output))
-{
-    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input); // Y plane: 4 samples are read on each of two rows
-    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input); // interleaved UV plane: 2 (U, V) pairs are read
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output); // 4 RGBA pixels (16 bytes) are written on each of two rows
-
-    uchar4 luma_0 = vload4(0, in_luma.ptr); // first row of luma
-    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y); // second row of luma
-    uchar4 cbcr   = vload4(0, in_uv.ptr); // bytes are U0, V0, U1, V1
-    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128); // each U is duplicated for 2 pixels; NOTE(review): relies on 8-bit wrap-around of uchar->char and of the 128 constant - confirm
-    char4  cr     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128); // each V is duplicated for 2 pixels; same wrap-around caveat
-
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr); // chroma contribution added to luma for R
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr); // chroma contribution added to luma for G
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr); // chroma contribution added to luma for B
-
-    float4 f_r = convert_float4(luma_0) + temp0; // first row
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r); // saturate to the uchar range, rounding toward zero
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); // pixels 0..1, 4th byte fixed at 255
-    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); // pixels 2..3, 4th byte fixed at 255
-    vstore8(rgb_0, 0, out_rgb.ptr); // output bytes 0..7 of the first row
-    vstore8(rgb_1, 0, out_rgb.ptr + 8); // output bytes 8..15 of the first row
-
-    f_r = convert_float4(luma_1) + temp0; // second row reuses the same chroma contributions
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); // pixels 0..1 of the second row
-    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); // pixels 2..3 of the second row
-    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y); // one output row stride further on
-    vstore8(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
-
-/** Convert an NV12 image to IYUV
- *
- * Each work-item copies a 16x2 block of luma unchanged and splits the 16 interleaved
- * chroma bytes that belong to it into 8 U and 8 V samples (even bytes -> U, odd bytes -> V).
- * No colour-space arithmetic is performed by this kernel.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source luma channel
- * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                         Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source uv channel
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- */
-__kernel void NV12_to_IYUV_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Each work-item handles 32 luma pixels: two rows of 16 pixels each
-    uchar16 luma_0 = vload16(0, in_y.ptr);
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
-    uchar16 cbcr   = vload16(0, in_uv.ptr); // one interleaved chroma row shared by both luma rows
-    uchar8  cb     = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
-    uchar8  cr     = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
-
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore8(cb, 0, out_u.ptr); // a single row of 8 U samples for the two luma rows
-    vstore8(cr, 0, out_v.ptr); // a single row of 8 V samples for the two luma rows
-}
-
-/** Convert an NV12 image to YUV444
- *
- * Each work-item copies a 16x2 block of luma unchanged and expands the 8 U / 8 V samples
- * that belong to it to full resolution: every chroma sample is written twice horizontally
- * and the resulting 16-sample row is written to both output rows.
- * No colour-space arithmetic is performed by this kernel.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source luma channel
- * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                         Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source uv channel
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- */
-__kernel void NV12_to_YUV444_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Each work-item handles 32 luma pixels: two rows of 16 pixels each
-    uchar16 luma_0 = vload16(0, in_y.ptr);
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
-    uchar16 cbcr   = vload16(0, in_uv.ptr); // one interleaved chroma row shared by both luma rows
-    uchar16 cb     = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
-                               cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se); // even bytes (U), each duplicated
-    uchar16 cr = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
-                           cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf); // odd bytes (V), each duplicated
-
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore16(cb, 0, out_u.ptr);
-    vstore16(cb, 0, out_u.ptr + u_output_stride_y); // same U row repeated for the second output row
-    vstore16(cr, 0, out_v.ptr);
-    vstore16(cr, 0, out_v.ptr + v_output_stride_y); // same V row repeated for the second output row
-}
-
-/** Convert an NV21 image to RGB888
- *
- * Each work-item converts a 4x2 block of pixels. The two luma rows share one row of
- * interleaved chroma (V first, then U), and every chroma pair is reused for two
- * horizontally adjacent pixels. The chroma contributions to R, G and B (temp0, temp1,
- * temp2) are computed once and added to the luma of both rows; results are converted
- * to U8 with saturation, rounding toward zero.
- *
- * NOTE(review): the chroma bytes are cast to char before 128 is subtracted, and 128 does
- * not fit in a signed char; this presumably relies on 8-bit wrap-around to centre the
- * samples on zero - confirm.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source luma channel
- * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                        Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source uv channel
- * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV21_to_RGB888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(rgb_output))
-{
-    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
-
-    // Each work-item handles 8 pixels: two rows of 4 pixels each
-    uchar4 luma_0 = vload4(0, in_y.ptr);
-    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
-    uchar4 cbcr   = vload4(0, in_uv.ptr);
-    char4  cr     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128); // even bytes: V (Cr)
-    char4  cb     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128); // odd bytes: U (Cb)
-
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
-    float4 f_r = convert_float4(luma_0) + temp0;
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr);     // bytes 0-7 of the 12 output bytes (4 pixels x 3 channels)
-    vstore4(rgb_1, 0, out_rgb.ptr + 8); // bytes 8-11
-
-    f_r = convert_float4(luma_1) + temp0; // second row reuses the same chroma terms
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
-    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
-
-/** Convert an NV21 image to RGBA8888
- *
- * Each work-item converts a 4x2 block of pixels. The two luma rows share one row of
- * interleaved chroma (V first, then U), and every chroma pair is reused for two
- * horizontally adjacent pixels. The chroma contributions to R, G and B (temp0, temp1,
- * temp2) are computed once and added to the luma of both rows; results are converted
- * to U8 with saturation, rounding toward zero. The alpha channel is always written as 255.
- *
- * NOTE(review): the chroma bytes are cast to char before 128 is subtracted, and 128 does
- * not fit in a signed char; this presumably relies on 8-bit wrap-around to centre the
- * samples on zero - confirm.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source luma channel
- * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                         Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source uv channel
- * @param[out] rgba_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgba_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgba_output_step_x                        rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgba_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgba_output_step_y                        rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV21_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(rgba_output))
-{
-    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
-
-    // Each work-item handles 8 pixels: two rows of 4 pixels each
-    uchar4 luma_0 = vload4(0, in_luma.ptr);
-    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
-    uchar4 cbcr   = vload4(0, in_uv.ptr);
-    char4  cr     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128); // even bytes: V (Cr)
-    char4  cb     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128); // odd bytes: U (Cb)
-
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
-    float4 f_r = convert_float4(luma_0) + temp0;
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
-    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    vstore8(rgb_0, 0, out_rgb.ptr);     // pixels 0-1 (RGBA, 4 bytes each)
-    vstore8(rgb_1, 0, out_rgb.ptr + 8); // pixels 2-3
-
-    f_r = convert_float4(luma_1) + temp0; // second row reuses the same chroma terms
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
-    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
-    vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
-}
-
-/** Convert an NV21 image to YUV444
- *
- * Each work-item copies a 16x2 block of luma unchanged and expands the 8 V / 8 U samples
- * that belong to it to full resolution: every chroma sample is written twice horizontally
- * and the resulting 16-sample row is written to both output rows. The interleaved source
- * row is read as V first (even bytes), then U (odd bytes).
- * No colour-space arithmetic is performed by this kernel.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source luma channel
- * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                         Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source uv channel
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- */
-__kernel void NV21_to_YUV444_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Each work-item handles 32 luma pixels: two rows of 16 pixels each
-    uchar16 luma_0 = vload16(0, in_y.ptr);
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
-    uchar16 cbcr   = vload16(0, in_uv.ptr); // one interleaved chroma row shared by both luma rows
-    uchar16 cr     = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
-                               cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se); // even bytes (V), each duplicated
-    uchar16 cb = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
-                           cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf); // odd bytes (U), each duplicated
-
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore16(cb, 0, out_u.ptr);
-    vstore16(cb, 0, out_u.ptr + u_output_stride_y); // same U row repeated for the second output row
-    vstore16(cr, 0, out_v.ptr);
-    vstore16(cr, 0, out_v.ptr + v_output_stride_y); // same V row repeated for the second output row
-}
-
-/** Convert an NV21 image to IYUV
- *
- * Each work-item copies a 16x2 block of luma unchanged and splits the 16 interleaved
- * chroma bytes that belong to it into 8 V and 8 U samples (even bytes -> V, odd bytes -> U).
- * No colour-space arithmetic is performed by this kernel.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source luma channel
- * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                         Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source uv channel
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- */
-__kernel void NV21_to_IYUV_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    uchar16 luma_0 = vload16(0, in_y.ptr);                       // first row, 16 luma pixels
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); // second row, 16 luma pixels
-    uchar16 cbcr   = vload16(0, in_uv.ptr);                      // one interleaved chroma row shared by both luma rows
-    uchar8  cr     = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
-    uchar8  cb     = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
-
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore8(cb, 0, out_u.ptr); // a single row of 8 U samples for the two luma rows
-    vstore8(cr, 0, out_v.ptr); // a single row of 8 V samples for the two luma rows
-}
-
-/** Convert a UYVY image to IYUV using BT709 color space
- *
- * Each work-item reads 16 bytes (8 pixels) from each of two consecutive input rows.
- * Within every 4-byte group the bytes are read as U, Y, V, Y: luma is taken from the odd
- * bytes and copied unchanged to the two output luma rows, while the U and V samples of
- * the two rows are averaged (integer division by 2) into a single row of 4 U and 4 V samples.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  uyvy_input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  uyvy_input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  uyvy_input_step_x                         uyvy_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uyvy_input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  uyvy_input_step_y                         uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uyvy_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void UYVY422_to_IYUV_bt709(
-    IMAGE_DECLARATION(uyvy_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_uyvy = CONVERT_TO_IMAGE_STRUCT(uyvy_input);
-    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Each work-item handles 16 pixels: two rows of 8 pixels each
-    uchar16 uyvy = vload16(0, in_uyvy.ptr);
-    uchar8  luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
-    ushort4 cb_0 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc); // widened to 16 bit for the sum below
-    ushort4 cr_0 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
-    vstore8(luma, 0, out_y.ptr);
-
-    uyvy         = vload16(0, in_uyvy.ptr + uyvy_input_stride_y); // second input row
-    luma         = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
-    ushort4 cb_1 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
-    ushort4 cr_1 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
-    vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
-
-    uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2)); // vertical average of the two rows
-    uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2)); // vertical average of the two rows
-    vstore4(cb, 0, out_u.ptr);
-    vstore4(cr, 0, out_v.ptr);
-}
-
-/** Convert a YUYV image to IYUV using BT709 color space
- *
- * Each work-item reads 16 bytes (8 YUYV pixels) from each of two vertically adjacent source rows. The 8 luma bytes
- * of each row (even byte positions) are stored unchanged to the luma plane. The 4 Cb bytes (positions 1, 5, 9, 13)
- * and the 4 Cr bytes (positions 3, 7, 11, 15) of the two rows are averaged with an integer division by 2 and stored
- * as a single row of 4 samples in the U and V planes respectively.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  yuyv_input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  yuyv_input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  yuyv_input_step_x                         yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  yuyv_input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  yuyv_input_step_y                         yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  yuyv_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void YUYV422_to_IYUV_bt709(
-    IMAGE_DECLARATION(yuyv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
-    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Process 16 pixels per work-item: 8 pixels from each of two consecutive rows; chroma is widened to 16-bit so the two-row sum cannot overflow before the divide
-    uchar16 yuyv = vload16(0, in_yuyv.ptr);
-    uchar8  luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
-    ushort4 cb_0 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
-    ushort4 cr_0 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
-    vstore8(luma, 0, out_y.ptr);
-
-    yuyv         = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
-    luma         = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
-    ushort4 cb_1 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
-    ushort4 cr_1 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
-    vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
-
-    uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
-    uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
-    vstore4(cb, 0, out_u.ptr);
-    vstore4(cr, 0, out_v.ptr);
-}
-
-/** Convert an IYUV image to RGB888
- *
- * Each work-item converts a 4x2 pixel patch: 4 luma bytes from each of two vertically adjacent rows plus 2 U and
- * 2 V bytes. Every chroma sample is reused for two horizontally adjacent pixels and for both rows.
- * With Cb' = Cb - 128 and Cr' = Cr - 128 the kernel evaluates, in float,
- *   R = Y + 1.5748 * Cr'
- *   G = Y - 0.1873 * Cb' - 0.4681 * Cr'
- *   B = Y + 1.8556 * Cb'
- * and converts each result to uchar with saturation and rounding toward zero. The 12 output bytes of each row
- * (4 interleaved RGB pixels) are written as one 8-byte store followed by one 4-byte store.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source luma channel
- * @param[in]  u_input_ptr                              Pointer to the source U channel. Supported Format: U8
- * @param[in]  u_input_stride_x                         Stride of the source image U channel in X dimension (in bytes)
- * @param[in]  u_input_step_x                           u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_input_stride_y                         Stride of the source image U channel in Y dimension (in bytes)
- * @param[in]  u_input_step_y                           u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_input_offset_first_element_in_bytes    The offset of the first element in the source U channel
- * @param[in]  v_input_ptr                              Pointer to the source V channel. Supported Format: U8
- * @param[in]  v_input_stride_x                         Stride of the source image V channel in X dimension (in bytes)
- * @param[in]  v_input_step_x                           v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_input_stride_y                         Stride of the source image V channel in Y dimension (in bytes)
- * @param[in]  v_input_step_y                           v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_input_offset_first_element_in_bytes    The offset of the first element in the source image V channel
- * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void IYUV_to_RGB888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(u_input),
-    IMAGE_DECLARATION(v_input),
-    IMAGE_DECLARATION(rgb_output))
-{
-    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
-    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
-
-    // Process 8 pixels per work-item (4 pixels on each of two rows); each U/V sample is duplicated for two adjacent pixels and re-centered by subtracting 128 in char4 arithmetic
-    uchar4 luma_0 = vload4(0, in_y.ptr);
-    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
-    uchar4 cbcr   = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
-    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
-    char4  cr     = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
-
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
-    float4 f_r = convert_float4(luma_0) + temp0;
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr);
-    vstore4(rgb_1, 0, out_rgb.ptr + 8);
-
-    f_r = convert_float4(luma_1) + temp0;
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
-    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
-
-/** Convert an IYUV image to RGBA8888
- *
- * Each work-item converts a 4x2 pixel patch: 4 luma bytes from each of two vertically adjacent rows plus 2 U and
- * 2 V bytes. Every chroma sample is reused for two horizontally adjacent pixels and for both rows.
- * With Cb' = Cb - 128 and Cr' = Cr - 128 the kernel evaluates, in float,
- *   R = Y + 1.5748 * Cr'
- *   G = Y - 0.1873 * Cb' - 0.4681 * Cr'
- *   B = Y + 1.8556 * Cb'
- * and converts each result to uchar with saturation and rounding toward zero. The alpha byte of every output pixel
- * is set to 255. The 16 output bytes of each row (4 RGBA pixels) are written as two 8-byte stores.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source luma channel
- * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
- * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
- * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_input_stride_y                          Stride of the source image U channel in Y dimension (in bytes)
- * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
- * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
- * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
- * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
- * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
- * @param[out] rgba_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgba_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgba_output_step_x                        rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgba_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgba_output_step_y                        rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void IYUV_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(u_input),
-    IMAGE_DECLARATION(v_input),
-    IMAGE_DECLARATION(rgba_output))
-{
-    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
-    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
-
-    // Process 8 pixels per work-item (4 pixels on each of two rows); each U/V sample is duplicated for two adjacent pixels and re-centered by subtracting 128 in char4 arithmetic
-    uchar4 luma_0 = vload4(0, in_y.ptr);
-    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
-    uchar4 cbcr   = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
-    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
-    char4  cr     = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
-
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
-    float4 f_r = convert_float4(luma_0) + temp0;
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
-    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    vstore8(rgb_0, 0, out_rgb.ptr);
-    vstore8(rgb_1, 0, out_rgb.ptr + 8);
-
-    f_r = convert_float4(luma_1) + temp0;
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
-    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
-    vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
-}
-
-/** Convert an IYUV image to YUV444
- *
- * Each work-item handles a 16x2 pixel patch. The 16 luma bytes of each of the two rows are copied unchanged.
- * 8 U bytes and 8 V bytes are loaded, each sample is duplicated horizontally to form 16 bytes, and the same
- * 16-byte chroma row is written to two consecutive rows of the U and V output planes. No arithmetic is applied
- * to the sample values.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source luma channel
- * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
- * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
- * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_input_stride_y                          Stride of the source image U channel in Y dimension (in bytes)
- * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
- * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
- * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
- * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
- * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void IYUV_to_YUV444_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(u_input),
-    IMAGE_DECLARATION(v_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_u  = CONVERT_TO_IMAGE_STRUCT(u_input);
-    Image in_v  = CONVERT_TO_IMAGE_STRUCT(v_input);
-    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Process 32 pixels per work-item (16 pixels on each of two rows); the 8 loaded chroma samples per plane are doubled horizontally and then stored to both output rows
-    uchar16 luma_0 = vload16(0, in_y.ptr);
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
-    uchar8  cb_src = vload8(0, in_u.ptr);
-    uchar8  cr_src = vload8(0, in_v.ptr);
-    uchar16 cb     = (uchar16)(cb_src.s0, cb_src.s0, cb_src.s1, cb_src.s1, cb_src.s2, cb_src.s2, cb_src.s3, cb_src.s3,
-                               cb_src.s4, cb_src.s4, cb_src.s5, cb_src.s5, cb_src.s6, cb_src.s6, cb_src.s7, cb_src.s7);
-    uchar16 cr = (uchar16)(cr_src.s0, cr_src.s0, cr_src.s1, cr_src.s1, cr_src.s2, cr_src.s2, cr_src.s3, cr_src.s3,
-                           cr_src.s4, cr_src.s4, cr_src.s5, cr_src.s5, cr_src.s6, cr_src.s6, cr_src.s7, cr_src.s7);
-
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore16(cb, 0, out_u.ptr);
-    vstore16(cb, 0, out_u.ptr + u_output_stride_y);
-    vstore16(cr, 0, out_v.ptr);
-    vstore16(cr, 0, out_v.ptr + v_output_stride_y);
-}
-
-/** Convert an IYUV image to NV12
- *
- * Each work-item handles a 16x2 pixel patch. The 16 luma bytes of each of the two rows are copied unchanged.
- * 8 U bytes and 8 V bytes are loaded and interleaved as Cb0, Cr0, Cb1, Cr1, ... into one 16-byte row of the
- * UV output plane. No arithmetic is applied to the sample values.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source luma channel
- * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
- * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
- * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_input_stride_y                          Stride of the source image U channel in Y dimension (in bytes)
- * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
- * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
- * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
- * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
- * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
- * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
- * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_output_stride_y                        Stride of the destination UV channel in Y dimension (in bytes)
- * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
- *
- */
-__kernel void IYUV_to_NV12_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(u_input),
-    IMAGE_DECLARATION(v_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(uv_output))
-{
-    Image in_y   = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_u   = CONVERT_TO_IMAGE_STRUCT(u_input);
-    Image in_v   = CONVERT_TO_IMAGE_STRUCT(v_input);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
-
-    // Process 32 pixels per work-item (16 pixels on each of two rows); the planar U and V samples are interleaved into a single CbCr row
-    uchar16 luma_0 = vload16(0, in_y.ptr);
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
-    uchar8  cb     = vload8(0, in_u.ptr);
-    uchar8  cr     = vload8(0, in_v.ptr);
-    uchar16 cbcr   = (uchar16)(cb.s0, cr.s0, cb.s1, cr.s1, cb.s2, cr.s2, cb.s3, cr.s3, cb.s4, cr.s4, cb.s5, cr.s5, cb.s6,
-                               cr.s6, cb.s7, cr.s7);
-
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore16(cbcr, 0, out_uv.ptr);
-}
-
-/** Convert a YUYV image to NV12 using BT709 color space
- *
- * Each work-item reads 16 bytes (8 YUYV pixels) from each of two vertically adjacent source rows. The 8 luma bytes
- * of each row (even byte positions) are stored unchanged to the luma plane. The 8 chroma bytes of each row (odd
- * byte positions, already interleaved as Cb, Cr, Cb, Cr, ...) are averaged between the two rows with an integer
- * division by 2 and stored as one 8-byte row of the UV plane.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  yuyv_input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  yuyv_input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  yuyv_input_step_x                         yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  yuyv_input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  yuyv_input_step_y                         yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  yuyv_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
- * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
- * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_output_stride_y                        Stride of the destination image UV channel in Y dimension (in bytes)
- * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
- *
- */
-__kernel void YUYV422_to_NV12_bt709(
-    IMAGE_DECLARATION(yuyv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(uv_output))
-{
-    Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
-    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_uv  = CONVERT_TO_IMAGE_STRUCT(uv_output);
-
-    // Process 16 pixels per work-item: 8 pixels from each of two consecutive rows; chroma is widened to 16-bit so the two-row sum cannot overflow before the divide
-    uchar16 yuyv   = vload16(0, in_yuyv.ptr);
-    ushort8 cbcr_0 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
-    uchar8  luma   = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
-    vstore8(luma, 0, out_y.ptr);
-
-    yuyv           = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
-    ushort8 cbcr_1 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
-    luma           = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
-    vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
-
-    uchar8 cbcr = convert_uchar8((cbcr_0 + cbcr_1) / (ushort8)(2));
-    vstore8(cbcr, 0, out_uv.ptr);
-}
-
-/** Convert a UYVY image to NV12 using BT709 color space
- *
- * Each work-item reads 16 bytes (8 UYVY pixels) from each of two vertically adjacent source rows. The 8 luma bytes
- * of each row (odd byte positions) are stored unchanged to the luma plane. The 8 chroma bytes of each row (even
- * byte positions, already interleaved as Cb, Cr, Cb, Cr, ...) are averaged between the two rows with an integer
- * division by 2 and stored as one 8-byte row of the uv plane.
- *
- * NOTE(review): the kernel body consumes 8 pixels per row per work-item, the same amount as YUYV422_to_NV12_bt709,
- * whose documentation states DIV_CEIL(width, 8). The DIV_CEIL(width, 4) figure below could not be checked against
- * the host-side window configuration from this file - confirm before relying on it.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  input_uyvy_ptr                           Pointer to the source image. Supported Format: U8
- * @param[in]  input_uyvy_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  input_uyvy_step_x                        input_uyvy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_uyvy_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_uyvy_step_y                        input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_ptr                                 Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_stride_x                            Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_step_x                              luma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_stride_y                            Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_step_y                              luma_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_offset_first_element_in_bytes       The offset of the first element in the destination image luma channel
- * @param[out] uv_ptr                                   Pointer to the destination uv channel. Supported Format: U8
- * @param[in]  uv_stride_x                              Stride of the destination uv channel in X dimension (in bytes)
- * @param[in]  uv_step_x                                uv_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_stride_y                              Stride of the destination image uv channel in Y dimension (in bytes)
- * @param[in]  uv_step_y                                uv_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_offset_first_element_in_bytes         The offset of the first element in the destination image uv channel
- *
- */
-__kernel void UYVY422_to_NV12_bt709(
-    IMAGE_DECLARATION(input_uyvy),
-    IMAGE_DECLARATION(luma),
-    IMAGE_DECLARATION(uv))
-{
-    Image in     = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma);
-    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
-
-    // Process 16 pixels per work-item: 8 pixels from each of two consecutive rows (top row "_t", bottom row "_b"); luma is taken with the odd-lane swizzle, chroma from the even lanes widened to 16-bit for averaging
-    const uchar16 uyvy_t = vload16(0, in.ptr);
-    vstore8(uyvy_t.s13579bdf, 0, out_y.ptr);
-
-    const uchar16 uyvy_b = vload16(0, in.ptr + input_uyvy_stride_y);
-    vstore8(uyvy_b.s13579bdf, 0, out_y.ptr + luma_stride_y);
-
-    const ushort8 cbcr_t = (ushort8)(uyvy_t.s0, uyvy_t.s2, uyvy_t.s4, uyvy_t.s6, uyvy_t.s8, uyvy_t.sa, uyvy_t.sc, uyvy_t.se);
-    const ushort8 cbcr_b = (ushort8)(uyvy_b.s0, uyvy_b.s2, uyvy_b.s4, uyvy_b.s6, uyvy_b.s8, uyvy_b.sa, uyvy_b.sc, uyvy_b.se);
-    const uchar8  cbcr   = convert_uchar8((cbcr_t + cbcr_b) / (ushort8)(2));
-    vstore8(cbcr, 0, out_uv.ptr);
-}

diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
deleted file mode 100644
index 7bca567..0000000
--- a/src/core/CL/cl_kernels/convolution3x3.cl
+++ /dev/null

@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE: element type of the convolution arithmetic, short unless overridden at build time */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT: element type of the stored result, uchar unless overridden at build time */
-
-/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- * @note 16 bytes are loaded from @p left_pixel although only the first 10 (lanes 0-9) contribute to the result.
- * @param[in] left_pixel   Pointer to the left pixel.
- * @param[in] left_coeff   Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff  Weight of the right pixel
- *
- * @return a VEC_DATA_TYPE(DATA_TYPE, 8) vector (short8 with the default DATA_TYPE) containing 8 convoluted values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3(__global const uchar *left_pixel,
-                                                  const short left_coeff,
-                                                  const short middle_coeff,
-                                                  const short right_coeff)
-{
-    uchar16 temp = vload16(0, left_pixel);
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    middle = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
-    // Weighted sum of the three views, each shifted by one pixel with respect to the previous one
-    return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff;
-}
-
-/** Apply a 3x3 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:
- *
- * [ mat0, mat1, mat2 ]\n
- * [ mat3, mat4, mat5 ]\n
- * [ mat6, mat7, mat8 ]\n
- *
- * @param[in] src   A pointer to source Image structure
- * @param[in] mat0  Coefficient from the convolution matrix
- * @param[in] mat1  Coefficient from the convolution matrix
- * @param[in] mat2  Coefficient from the convolution matrix
- * @param[in] mat3  Coefficient from the convolution matrix
- * @param[in] mat4  Coefficient from the convolution matrix
- * @param[in] mat5  Coefficient from the convolution matrix
- * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat7  Coefficient from the convolution matrix
- * @param[in] mat8  Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0). Must not be 0: the division is unconditional
- *
- * @return a VEC_DATA_TYPE(DATA_TYPE, 8) vector (short8 with the default DATA_TYPE) containing 8 convoluted and scaled values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3(
-    Image      *src,
-    const short mat0, const short mat1, const short mat2,
-    const short mat3, const short mat4, const short mat5,
-    const short mat6, const short mat7, const short mat8, uint scale)
-{
-    // Output pixels
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels;
-
-    // Row 0 (one row above the current position, starting one pixel to the left)
-    pixels = convolution1x3(offset(src, -1, -1), mat0, mat1, mat2);
-    // Row 1 (current row)
-    pixels += convolution1x3(offset(src, -1, 0), mat3, mat4, mat5);
-    // Row 2 (one row below)
-    pixels += convolution1x3(offset(src, -1, 1), mat6, mat7, mat8);
-
-    // Divide by the scale (no zero check is performed here)
-    return pixels / (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
-}
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 3x3 static convolution matrix to a single channel U8 input image and output a single channel image.
- *
- * @attention The matrix coefficients (MAT0, MAT1, ... MAT8, SCALE) need to be passed at compile time; DATA_TYPE and DATA_TYPE_OUT default to short and uchar when not passed.\n
- * e.g. -DMAT0=0, -DMAT1=1, ... -DMAT8=8, -DSCALE=1, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution3x3_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels = convolution3x3(&src,
-                            MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, SCALE);
-
-    // Convert the 8 results to DATA_TYPE_OUT with saturation and store them in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION

diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
deleted file mode 100644
index 9995ebf..0000000
--- a/src/core/CL/cl_kernels/convolution5x5.cl
+++ /dev/null

@@ -1,287 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE: element type of the horizontal pass and of the full 5x5 accumulation, short unless overridden */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE: accumulator element type of the vertical 5x1 pass, int unless overridden */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT: element type of the stored result, uchar unless overridden */
-
-/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- * @note 16 bytes are loaded from @p left_pixel although only the first 12 (lanes 0-b) contribute to the result.
- * @param[in] left_pixel   Pointer to the left pixel
- * @param[in] left1_coeff  Weight of the most left pixel
- * @param[in] left2_coeff  Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right1_coeff Weight of the right pixel
- * @param[in] right2_coeff Weight of the most right pixel
- *
- * @return a VEC_DATA_TYPE(DATA_TYPE, 8) vector (short8 with the default DATA_TYPE) containing 8 convoluted values.
- */
-VEC_DATA_TYPE(DATA_TYPE, 8)
-convolution1x5(
-    __global const uchar *left_pixel,
-    const short           left1_coeff,
-    const short           left2_coeff,
-    const short           middle_coeff,
-    const short           right1_coeff,
-    const short           right2_coeff)
-{
-    uchar16 temp = vload16(0, left_pixel);
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    middle = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right1 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right2 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
-    // Weighted sum of the five views, each shifted by one pixel with respect to the previous one
-    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff
-           + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff;
-}
-
-/** Compute a 1D vertical convolution of size 5 on 8 consecutive elements. Elements are read as DATA_TYPE and accumulated as COMPUTE_TYPE.
- *
- * @param[in] src          Pointer to source image.
- * @param[in] up1_coeff    Weight of the most up pixel
- * @param[in] up2_coeff    Weight of the up pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff  Weight of the down pixel
- * @param[in] down2_coeff  Weight of the most down pixel
- *
- * @return a VEC_DATA_TYPE(COMPUTE_TYPE, 8) vector (int8 with the default COMPUTE_TYPE) containing 8 convoluted values.
- */
-VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-convolution5x1(
-    Image      *src,
-    const short up1_coeff,
-    const short up2_coeff,
-    const short middle_coeff,
-    const short down1_coeff,
-    const short down2_coeff)
-{
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    val;
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
-    // Rows -2 .. +2 relative to the current position: load 8 elements, widen to COMPUTE_TYPE, weight and accumulate
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
-
-    return out;
-}
-
-/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:\n
- * [  mat0,  mat1,  mat2,  mat3,  mat4 ]\n
- * [  mat5,  mat6,  mat7,  mat8,  mat9 ]\n
- * [ mat10, mat11, mat12, mat13, mat14 ]\n
- * [ mat15, mat16, mat17, mat18, mat19 ]\n
- * [ mat20, mat21, mat22, mat23, mat24 ]
- *
- * @param[in] src   A pointer to source Image structure.
- * @param[in] mat0  Coefficient from the convolution matrix
- * @param[in] mat1  Coefficient from the convolution matrix
- * @param[in] mat2  Coefficient from the convolution matrix
- * @param[in] mat3  Coefficient from the convolution matrix
- * @param[in] mat4  Coefficient from the convolution matrix
- * @param[in] mat5  Coefficient from the convolution matrix
- * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat7  Coefficient from the convolution matrix
- * @param[in] mat8  Coefficient from the convolution matrix
- * @param[in] mat9  Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
- * @param[in] mat11 Coefficient from the convolution matrix
- * @param[in] mat12 Coefficient from the convolution matrix
- * @param[in] mat13 Coefficient from the convolution matrix
- * @param[in] mat14 Coefficient from the convolution matrix
- * @param[in] mat15 Coefficient from the convolution matrix
- * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat17 Coefficient from the convolution matrix
- * @param[in] mat18 Coefficient from the convolution matrix
- * @param[in] mat19 Coefficient from the convolution matrix
- * @param[in] mat20 Coefficient from the convolution matrix
- * @param[in] mat21 Coefficient from the convolution matrix
- * @param[in] mat22 Coefficient from the convolution matrix
- * @param[in] mat23 Coefficient from the convolution matrix
- * @param[in] mat24 Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0). The division is skipped when scale is 0
- *
- * @return a short8 containing 8 convoluted and scaled values, converted to short with saturation.
- */
-short8 convolution5x5(
-    Image      *src,
-    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
-    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
-    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
-    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
-    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
-    uint scale)
-{
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels;
-    // One horizontal 1x5 pass per matrix row (rows -2 .. +2), each starting two pixels to the left of the current position
-    pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4);
-    pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9);
-    pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14);
-    pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19);
-    pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24);
-    // scale is unsigned, so this only guards against a division by zero
-    if(scale > 0)
-    {
-        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
-    }
-
-    return convert_short8_sat(pixels);
-}
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single temporary channel image(Support U16, S16, S32).
- *
- * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) need to be passed at compile time; DATA_TYPE defaults to short when not passed:\n
- * e.g. -DMAT0=0, -DMAT1=1, -DMAT2=2, -DMAT3=3, -DMAT4=4, -DDATA_TYPE=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable1x5_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels: horizontal pass on the current row, starting two pixels to the left
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4);
-
-    // Store the unscaled result in dst as DATA_TYPE (no saturation, no division is applied here)
-    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-/** Apply a 5x1 static convolution matrix to a single channel intermediate image (elements read as DATA_TYPE) and output a single channel image.
- *
- * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time. SCALE must not be 0 (unconditional division):\n
- * e.g. -DMAT5=1, -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable5x1_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels: vertical pass over rows -2 .. +2
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9);
-
-    // Divide by the scale
-    pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
-
-    // Convert the 8 results to DATA_TYPE_OUT with saturation and store them in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including borders
- *
- * @attention The matrix coefficients(MAT0, MAT1, ... MAT24, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=0, -DMAT1=1, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution5x5_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    short8 pixels = convolution5x5(&src,
-                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
-                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE);
-
-    // Convert the 8 results to DATA_TYPE_OUT with saturation and store them in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION

diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl
deleted file mode 100644
index 50fb3d7..0000000
--- a/src/core/CL/cl_kernels/convolution7x7.cl
+++ /dev/null

@@ -1,338 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE: element type of the horizontal pass and of the full 7x7 accumulation, short unless overridden */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE: accumulator element type of the vertical 7x1 pass, int unless overridden */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT: fallback output element type, uchar unless overridden */
-
-/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- * @note 16 bytes are loaded from @p left_pixel although only the first 14 (lanes 0-d) contribute to the result.
- * @param[in] left_pixel   Pointer to the left pixel
- * @param[in] left1_coeff  Weight of the most left pixel
- * @param[in] left2_coeff  Weight of the second left pixel
- * @param[in] left3_coeff  Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right1_coeff Weight of the right pixel
- * @param[in] right2_coeff Weight of the second right pixel
- * @param[in] right3_coeff Weight of the most right pixel
- *
- * @return a VEC_DATA_TYPE(DATA_TYPE, 8) vector (short8 with the default DATA_TYPE) containing 8 convoluted values.
- */
-VEC_DATA_TYPE(DATA_TYPE, 8)
-convolution1x7(
-    __global const uchar *left_pixel,
-    const short           left1_coeff,
-    const short           left2_coeff,
-    const short           left3_coeff,
-    const short           middle_coeff,
-    const short           right1_coeff,
-    const short           right2_coeff,
-    const short           right3_coeff)
-{
-    uchar16 temp = vload16(0, left_pixel);
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    middle = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right1 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right2 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right3 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
-    // Weighted sum of the seven views, each shifted by one pixel with respect to the previous one
-    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE,
-            8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff;
-}
-
-/** Compute a 1D vertical convolution of size 7 on 8 consecutive elements. Elements are read as DATA_TYPE and accumulated as COMPUTE_TYPE.
- *
- * @param[in] src          Pointer to source image.
- * @param[in] up1_coeff    Weight of the most up pixel
- * @param[in] up2_coeff    Weight of the second up pixel
- * @param[in] up3_coeff    Weight of the up pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff  Weight of the down pixel
- * @param[in] down2_coeff  Weight of the second down pixel
- * @param[in] down3_coeff  Weight of the most down pixel
- *
- * @return a VEC_DATA_TYPE(COMPUTE_TYPE, 8) vector (int8 with the default COMPUTE_TYPE) containing 8 convoluted values.
- */
-VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-convolution7x1(
-    Image      *src,
-    const short up1_coeff,
-    const short up2_coeff,
-    const short up3_coeff,
-    const short middle_coeff,
-    const short down1_coeff,
-    const short down2_coeff,
-    const short down3_coeff)
-{
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    val;
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
-    // Rows -3 .. +3 relative to the current position: load 8 elements, widen to COMPUTE_TYPE, weight and accumulate
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
-
-    return out;
-}
-
-/** Apply a 7x7 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:\n
- * [  mat0,  mat1,  mat2,  mat3,  mat4,  mat5,  mat6 ]\n
- * [  mat7,  mat8,  mat9,  mat10, mat11, mat12, mat13 ]\n
- * [  mat14, mat15, mat16, mat17, mat18, mat19, mat20 ]\n
- * [  mat21, mat22, mat23, mat24, mat25, mat26, mat27 ]\n
- * [  mat28, mat29, mat30, mat31, mat32, mat33, mat34 ]\n
- * [  mat35, mat36, mat37, mat38, mat39, mat40, mat41 ]\n
- * [  mat42, mat43, mat44, mat45, mat46, mat47, mat48 ]
- *
- * @param[in] src   A pointer to source Image structure.
- * @param[in] mat0  Coefficient from the convolution matrix
- * @param[in] mat1  Coefficient from the convolution matrix
- * @param[in] mat2  Coefficient from the convolution matrix
- * @param[in] mat3  Coefficient from the convolution matrix
- * @param[in] mat4  Coefficient from the convolution matrix
- * @param[in] mat5  Coefficient from the convolution matrix
- * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat7  Coefficient from the convolution matrix
- * @param[in] mat8  Coefficient from the convolution matrix
- * @param[in] mat9  Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
- * @param[in] mat11 Coefficient from the convolution matrix
- * @param[in] mat12 Coefficient from the convolution matrix
- * @param[in] mat13 Coefficient from the convolution matrix
- * @param[in] mat14 Coefficient from the convolution matrix
- * @param[in] mat15 Coefficient from the convolution matrix
- * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat17 Coefficient from the convolution matrix
- * @param[in] mat18 Coefficient from the convolution matrix
- * @param[in] mat19 Coefficient from the convolution matrix
- * @param[in] mat20 Coefficient from the convolution matrix
- * @param[in] mat21 Coefficient from the convolution matrix
- * @param[in] mat22 Coefficient from the convolution matrix
- * @param[in] mat23 Coefficient from the convolution matrix
- * @param[in] mat24 Coefficient from the convolution matrix
- * @param[in] mat25 Coefficient from the convolution matrix
- * @param[in] mat26 Coefficient from the convolution matrix
- * @param[in] mat27 Coefficient from the convolution matrix
- * @param[in] mat28 Coefficient from the convolution matrix
- * @param[in] mat29 Coefficient from the convolution matrix
- * @param[in] mat30 Coefficient from the convolution matrix
- * @param[in] mat31 Coefficient from the convolution matrix
- * @param[in] mat32 Coefficient from the convolution matrix
- * @param[in] mat33 Coefficient from the convolution matrix
- * @param[in] mat34 Coefficient from the convolution matrix
- * @param[in] mat35 Coefficient from the convolution matrix
- * @param[in] mat36 Coefficient from the convolution matrix
- * @param[in] mat37 Coefficient from the convolution matrix
- * @param[in] mat38 Coefficient from the convolution matrix
- * @param[in] mat39 Coefficient from the convolution matrix
- * @param[in] mat40 Coefficient from the convolution matrix
- * @param[in] mat41 Coefficient from the convolution matrix
- * @param[in] mat42 Coefficient from the convolution matrix
- * @param[in] mat43 Coefficient from the convolution matrix
- * @param[in] mat44 Coefficient from the convolution matrix
- * @param[in] mat45 Coefficient from the convolution matrix
- * @param[in] mat46 Coefficient from the convolution matrix
- * @param[in] mat47 Coefficient from the convolution matrix
- * @param[in] mat48 Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0). The division is skipped when scale is 0
- * @return a short8 containing 8 convoluted and scaled values, converted to short with saturation.
- */
-short8 convolution7x7(
-    Image      *src,
-    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
-    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
-    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
-    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
-    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
-    const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
-    const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
-    const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
-    const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
-    const short mat45, const short mat46, const short mat47, const short mat48, uint scale)
-{
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels;
-    // One horizontal 1x7 pass per matrix row (rows -3 .. +3), each starting three pixels to the left of the current position
-    pixels = convolution1x7(offset(src, -3, -3), mat0, mat1, mat2, mat3, mat4, mat5, mat6);
-    pixels += convolution1x7(offset(src, -3, -2), mat7, mat8, mat9, mat10, mat11, mat12, mat13);
-    pixels += convolution1x7(offset(src, -3, -1), mat14, mat15, mat16, mat17, mat18, mat19, mat20);
-    pixels += convolution1x7(offset(src, -3, 0), mat21, mat22, mat23, mat24, mat25, mat26, mat27);
-    pixels += convolution1x7(offset(src, -3, 1), mat28, mat29, mat30, mat31, mat32, mat33, mat34);
-    pixels += convolution1x7(offset(src, -3, 2), mat35, mat36, mat37, mat38, mat39, mat40, mat41);
-    pixels += convolution1x7(offset(src, -3, 3), mat42, mat43, mat44, mat45, mat46, mat47, mat48);
-    // scale is unsigned, so this only guards against a division by zero
-    if(scale > 0)
-    {
-        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
-    }
-
-    return convert_short8_sat(pixels);
-}
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 1x7 static convolution matrix to a single channel U8 input image and output a single temporary channel image.
- *
- * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6) and DATA_TYPE need to be passed at compile time:\n
- * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT6=6, -DDATA_TYPE=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable1x7_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels = convolution1x7(offset(&src, -3, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6);
-
-    // Store result in dst
-    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-/** Apply a 7x1 static convolution matrix to a single channel U8 input image and output a single channel image.
- *
- * @attention The matrix coefficients (MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT24=13, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable7x1_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    pixels = convolution7x1(&src, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13);
-
-    // Divide by the scale
-    pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
-
-    // Store result in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-/** Apply a static 7x7 convolution matrix to a single channel U8 input image and output a single channel U8 image including the borders.
- *
- * @attention The matrix coefficients(MAT0, MAT1, ... MAT48, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT48=48, -DSCALE=6, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution7x7_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    short8 pixels = convolution7x7(&src,
-                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
-                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
-                                   MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
-                                   MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, SCALE);
-
-    // Clamp results to [ 0, 255 ] and store them in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION

diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl
deleted file mode 100644
index 7e77c61..0000000
--- a/src/core/CL/cl_kernels/convolution9x9.cl
+++ /dev/null

@@ -1,403 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] left_pixel   Pointer to the left pixel
- * @param[in] left1_coeff  Weight of the most left pixel
- * @param[in] left2_coeff  Weight of the second left pixel
- * @param[in] left3_coeff  Weight of the third left pixel
- * @param[in] left4_coeff  Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right1_coeff Weight of the right pixel
- * @param[in] right2_coeff Weight of the second right pixel
- * @param[in] right3_coeff Weight of the third right pixel
- * @param[in] right4_coeff Weight of the most right pixel
- *
- * @return a short8 containing 8 convoluted values.
- */
-VEC_DATA_TYPE(DATA_TYPE, 8)
-convolution1x9(
-    __global const uchar *left_pixel,
-    const short           left1_coeff,
-    const short           left2_coeff,
-    const short           left3_coeff,
-    const short           left4_coeff,
-    const short           middle_coeff,
-    const short           right1_coeff,
-    const short           right2_coeff,
-    const short           right3_coeff,
-    const short           right4_coeff)
-{
-    uchar16 temp = vload16(0, left_pixel);
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left4 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    middle = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right1 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right2 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right3 = CONVERT(temp.s789abcde, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right4 = CONVERT(temp.s89abcdef, VEC_DATA_TYPE(DATA_TYPE, 8));
-
-    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + left4 * (VEC_DATA_TYPE(DATA_TYPE,
-            8))left4_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE,
-                    8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff + right4 * (VEC_DATA_TYPE(DATA_TYPE, 8))right4_coeff;
-}
-
-/** Compute a 1D vertical convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] src          Pointer to source image.
- * @param[in] up1_coeff    Weight of the most up pixel
- * @param[in] up2_coeff    Weight of the second up pixel
- * @param[in] up3_coeff    Weight of the third up pixel
- * @param[in] up4_coeff    Weight of the up pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff  Weight of the down pixel
- * @param[in] down2_coeff  Weight of the second down pixel
- * @param[in] down3_coeff  Weight of the third down pixel
- * @param[in] down4_coeff  Weight of the most down pixel
- *
- * @return a short8 containing 8 convoluted values.
- */
-VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-convolution9x1(
-    Image      *src,
-    const short up1_coeff,
-    const short up2_coeff,
-    const short up3_coeff,
-    const short up4_coeff,
-    const short middle_coeff,
-    const short down1_coeff,
-    const short down2_coeff,
-    const short down3_coeff,
-    const short down4_coeff)
-{
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    val;
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up4_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down4_coeff;
-
-    return out;
-}
-
-/** Apply a 9x9 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:\n
- * [  mat0,  mat1,  mat2,  mat3 , mat4,  mat5,  mat6,  mat7, mat8 ]\n
- * [  mat9,  mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17 ]\n
- * [  mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26 ]\n
- * [  mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35 ]\n
- * [  mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44 ]\n
- * [  mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53 ]\n
- * [  mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62 ]
- * [  mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71 ]
- * [  mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80 ]
- *
- * @param[in] src   A pointer to source Image structure.
- * @param[in] mat0  Coefficient from the convolution matrix
- * @param[in] mat1  Coefficient from the convolution matrix
- * @param[in] mat2  Coefficient from the convolution matrix
- * @param[in] mat3  Coefficient from the convolution matrix
- * @param[in] mat4  Coefficient from the convolution matrix
- * @param[in] mat5  Coefficient from the convolution matrix
- * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat7  Coefficient from the convolution matrix
- * @param[in] mat8  Coefficient from the convolution matrix
- * @param[in] mat9  Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
- * @param[in] mat11 Coefficient from the convolution matrix
- * @param[in] mat12 Coefficient from the convolution matrix
- * @param[in] mat13 Coefficient from the convolution matrix
- * @param[in] mat14 Coefficient from the convolution matrix
- * @param[in] mat15 Coefficient from the convolution matrix
- * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat17 Coefficient from the convolution matrix
- * @param[in] mat18 Coefficient from the convolution matrix
- * @param[in] mat19 Coefficient from the convolution matrix
- * @param[in] mat20 Coefficient from the convolution matrix
- * @param[in] mat21 Coefficient from the convolution matrix
- * @param[in] mat22 Coefficient from the convolution matrix
- * @param[in] mat23 Coefficient from the convolution matrix
- * @param[in] mat24 Coefficient from the convolution matrix
- * @param[in] mat25 Coefficient from the convolution matrix
- * @param[in] mat26 Coefficient from the convolution matrix
- * @param[in] mat27 Coefficient from the convolution matrix
- * @param[in] mat28 Coefficient from the convolution matrix
- * @param[in] mat29 Coefficient from the convolution matrix
- * @param[in] mat30 Coefficient from the convolution matrix
- * @param[in] mat31 Coefficient from the convolution matrix
- * @param[in] mat32 Coefficient from the convolution matrix
- * @param[in] mat33 Coefficient from the convolution matrix
- * @param[in] mat34 Coefficient from the convolution matrix
- * @param[in] mat35 Coefficient from the convolution matrix
- * @param[in] mat36 Coefficient from the convolution matrix
- * @param[in] mat37 Coefficient from the convolution matrix
- * @param[in] mat38 Coefficient from the convolution matrix
- * @param[in] mat39 Coefficient from the convolution matrix
- * @param[in] mat40 Coefficient from the convolution matrix
- * @param[in] mat41 Coefficient from the convolution matrix
- * @param[in] mat42 Coefficient from the convolution matrix
- * @param[in] mat43 Coefficient from the convolution matrix
- * @param[in] mat44 Coefficient from the convolution matrix
- * @param[in] mat45 Coefficient from the convolution matrix
- * @param[in] mat46 Coefficient from the convolution matrix
- * @param[in] mat47 Coefficient from the convolution matrix
- * @param[in] mat48 Coefficient from the convolution matrix
- * @param[in] mat49 Coefficient from the convolution matrix
- * @param[in] mat50 Coefficient from the convolution matrix
- * @param[in] mat51 Coefficient from the convolution matrix
- * @param[in] mat52 Coefficient from the convolution matrix
- * @param[in] mat53 Coefficient from the convolution matrix
- * @param[in] mat54 Coefficient from the convolution matrix
- * @param[in] mat55 Coefficient from the convolution matrix
- * @param[in] mat56 Coefficient from the convolution matrix
- * @param[in] mat57 Coefficient from the convolution matrix
- * @param[in] mat58 Coefficient from the convolution matrix
- * @param[in] mat59 Coefficient from the convolution matrix
- * @param[in] mat60 Coefficient from the convolution matrix
- * @param[in] mat61 Coefficient from the convolution matrix
- * @param[in] mat62 Coefficient from the convolution matrix
- * @param[in] mat63 Coefficient from the convolution matrix
- * @param[in] mat64 Coefficient from the convolution matrix
- * @param[in] mat65 Coefficient from the convolution matrix
- * @param[in] mat66 Coefficient from the convolution matrix
- * @param[in] mat67 Coefficient from the convolution matrix
- * @param[in] mat68 Coefficient from the convolution matrix
- * @param[in] mat69 Coefficient from the convolution matrix
- * @param[in] mat70 Coefficient from the convolution matrix
- * @param[in] mat71 Coefficient from the convolution matrix
- * @param[in] mat72 Coefficient from the convolution matrix
- * @param[in] mat73 Coefficient from the convolution matrix
- * @param[in] mat74 Coefficient from the convolution matrix
- * @param[in] mat75 Coefficient from the convolution matrix
- * @param[in] mat76 Coefficient from the convolution matrix
- * @param[in] mat77 Coefficient from the convolution matrix
- * @param[in] mat78 Coefficient from the convolution matrix
- * @param[in] mat79 Coefficient from the convolution matrix
- * @param[in] mat80 Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
- *
- */
-short8 convolution9x9(
-    Image      *src,
-    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
-    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
-    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
-    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
-    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
-    const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
-    const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
-    const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
-    const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
-    const short mat45, const short mat46, const short mat47, const short mat48, const short mat49,
-    const short mat50, const short mat51, const short mat52, const short mat53, const short mat54,
-    const short mat55, const short mat56, const short mat57, const short mat58, const short mat59,
-    const short mat60, const short mat61, const short mat62, const short mat63, const short mat64,
-    const short mat65, const short mat66, const short mat67, const short mat68, const short mat69,
-    const short mat70, const short mat71, const short mat72, const short mat73, const short mat74,
-    const short mat75, const short mat76, const short mat77, const short mat78, const short mat79,
-    const short mat80, uint scale)
-{
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels;
-
-    pixels = convolution1x9(offset(src, -4, -4), mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8);
-    pixels += convolution1x9(offset(src, -4, -3), mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17);
-    pixels += convolution1x9(offset(src, -4, -2), mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26);
-    pixels += convolution1x9(offset(src, -4, -1), mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35);
-    pixels += convolution1x9(offset(src, -4, 0), mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44);
-    pixels += convolution1x9(offset(src, -4, 1), mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53);
-    pixels += convolution1x9(offset(src, -4, 2), mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62);
-    pixels += convolution1x9(offset(src, -4, 3), mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71);
-    pixels += convolution1x9(offset(src, -4, 4), mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80);
-
-    if(scale > 0)
-    {
-        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
-    }
-
-    return convert_short8_sat(pixels);
-}
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 1x9 static convolution matrix to a single channel U8 input image and output a single temporary channel image.
- *
- * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8) and DATA_TYPE need to be passed at compile time:\n
- * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT8=8, -DCOMPUTE_TYPE=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable1x9_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels = convolution1x9(offset(&src, -4, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8);
-
-    // Store result in dst
-    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-/** Apply a 9x1 static convolution matrix to a single channel U8 input image and output a single channel image.
- *
- * @attention The matrix coefficients (MAT9, MAT10, ... MAT17, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT9=9 -DMAT10=10, ... -DMAT17=17, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable9x1_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    pixels = convolution9x1(&src, MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17);
-
-    // Divide by the scale
-    pixels = pixels / (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
-
-    // Store result in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-/** Apply a static 9x9 convolution matrix to a single channel U8 input image and output a single channel image including borders
- *
- * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution9x9_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    short8 pixels = convolution9x9(&src,
-                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
-                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
-                                   MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
-                                   MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, MAT49,
-                                   MAT50, MAT51, MAT52, MAT53, MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61,
-                                   MAT62, MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, MAT72, MAT73,
-                                   MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80, SCALE);
-
-    // Store the result as is in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION

diff --git a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl
deleted file mode 100644
index 925a698..0000000
--- a/src/core/CL/cl_kernels/convolution_rectangle.cl
+++ /dev/null

@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "convolution3x3.cl"
-#include "convolution5x5.cl"
-#include "convolution7x7.cl"
-#include "convolution9x9.cl"
-#include "helpers.h"
-
-#define MAT_INDEX(i) MAT##i
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a rectangle matrix to a single channel U8 input image and output a single channel image including borders
- *
- * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), MATRIX_WIDTH, MATRIX_HEIGHT, COMPUTE_TYPE, DATA_TYPE, DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DMATRIX_WIDTH=3, -DMATRIX_HEIGHT=5, -DCOMPUTE_TYPE=int, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_rectangle(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    short matrix_coeff[81] =
-    {
-        MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8,
-        MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17,
-        MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, MAT26,
-        MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35,
-        MAT36, MAT37, MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44,
-        MAT45, MAT46, MAT47, MAT48, MAT49, MAT50, MAT51, MAT52, MAT53,
-        MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, MAT62,
-        MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71,
-        MAT72, MAT73, MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80
-    };
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels = (VEC_DATA_TYPE(DATA_TYPE, 8))0;
-
-    for(int i = 0; i < MATRIX_HEIGHT; i++)
-    {
-#if MATRIX_WIDTH == 3
-        pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3],
-                                 matrix_coeff[2 + i * 3]);
-#endif /* MATRIX_WIDTH */
-
-#if MATRIX_WIDTH == 5
-        pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5],
-                                 matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]);
-#endif /* MATRIX_WIDTH */
-
-#if MATRIX_WIDTH == 7
-        pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 7], matrix_coeff[1 + i * 7],
-                                 matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7],
-                                 matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]);
-#endif /* MATRIX_WIDTH */
-
-#if MATRIX_WIDTH == 9
-        pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9],
-                                 matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9],
-                                 matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]);
-#endif /* MATRIX_WIDTH */
-    }
-
-    pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE;
-
-    // Store the result as is in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr));
-}
-
-#endif /* not DYNAMIC_MATRIX_CONVOLUTION */

diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl
deleted file mode 100644
index dddbb4d..0000000
--- a/src/core/CL/cl_kernels/derivative.cl
+++ /dev/null

@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This OpenCL kernel computes the first-order derivative.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void derivative(
-    IMAGE_DECLARATION(src)
-#ifdef GRAD_X
-    ,
-    IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    ,
-    IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
-    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-#ifdef GRAD_X
-    short16 l_data = convert_short16(vload16(0, offset(&src, -1, 0)));
-    short16 r_data = convert_short16(vload16(0, offset(&src, 1, 0)));
-    vstore16(r_data - l_data, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    short16 t_data = convert_short16(vload16(0, offset(&src, 0, -1)));
-    short16 b_data = convert_short16(vload16(0, offset(&src, 0, 1)));
-    vstore16(b_data - t_data, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}

diff --git a/src/core/CL/cl_kernels/dilate.cl b/src/core/CL/cl_kernels/dilate.cl
deleted file mode 100644
index 14362c1..0000000
--- a/src/core/CL/cl_kernels/dilate.cl
+++ /dev/null

@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function dilates an input image.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void dilate(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 top    = vload16(0, offset(&src, -1, -1));
-    uchar16 middle = vload16(0, offset(&src, -1, 0));
-    uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
-    uchar16 tmp = max(top, max(middle, bottom));
-    uchar8  out = max(tmp.s01234567, max(tmp.s12345678, tmp.s23456789));
-
-    vstore8(out, 0, dst.ptr);
-}

diff --git a/src/core/CL/cl_kernels/erode.cl b/src/core/CL/cl_kernels/erode.cl
deleted file mode 100644
index 810c5fc..0000000
--- a/src/core/CL/cl_kernels/erode.cl
+++ /dev/null

@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function erodes an input image.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void erode(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 top    = vload16(0, offset(&src, -1, -1));
-    uchar16 middle = vload16(0, offset(&src, -1, 0));
-    uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
-    uchar16 tmp = min(top, min(middle, bottom));
-    uchar8  out = min(tmp.s01234567, min(tmp.s12345678, tmp.s23456789));
-
-    vstore8(out, 0, dst.ptr);
-}

diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl
deleted file mode 100644
index 89c144a..0000000
--- a/src/core/CL/cl_kernels/fast_corners.cl
+++ /dev/null

@@ -1,262 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-/* The map table to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P.
- *
- *      . . F 0 1 . . .
- *      . E . . . 2 . .
- *      D . . . . . 3 .
- *      C . . P . . 4 .
- *      B . . . . . 5 .
- *      . A . . . 6 . .
- *      . . 9 8 7 . . .
- */
-constant int offsets_s[16][2] =
-{
-    { 0, -3 },  // 0
-    { 1, -3 },  // 1
-    { 2, -2 },  // 2
-    { 3, -1 },  // 3
-    { 3, 0 },   // 4
-    { 3, 1 },   // 5
-    { 2, 2 },   // 6
-    { 1, 3 },   // 7
-    { 0, 3 },   // 8
-    { -1, 3 },  // 9
-    { -2, 2 },  // A
-    { -3, 1 },  // B
-    { -3, 0 },  // C
-    { -3, -1 }, // D
-    { -2, -2 }, // E
-    { -1, -3 }, // F
-};
-
-/** Load a pixel and set the mask values.
- *
- * @param[in]  ptr         The pointer to the starting address of source image
- * @param[in]  a           Index to indicate the position in the Bresenham circle
- * @param[in]  stride      Stride of source image in x dimension
- * @param[in]  dark        The left end of the threshold range
- * @param[in]  bright      The right end of the threshold range
- * @param[out] dark_mask   The bit-set mask records dark pixels. Its bit is set as 1 if the corresponding pixel is dark
- * @param[out] bright_mask The bit-set mask records bright pixels. Its bit is set as 1 if the corresponding pixel is bright
- *
- */
-#define LOAD_AND_SET_MASK(ptr, a, stride, dark, bright, dark_mask, bright_mask) \
-    {                                                                           \
-        unsigned char pixel;                                                    \
-        pixel = *(ptr + (int)stride * offsets_s[a][1] + offsets_s[a][0]);       \
-        dark_mask |= (pixel < dark) << a;                                       \
-        bright_mask |= (pixel > bright) << a;                                   \
-    }
-
-/** Checks if a pixel is a corner. A pixel is considered a corner if 9 contiguous pixels in the Bresenham circle are all bright or all dark.
- *
- * @param[in]  bright_mask The mask recording positions of bright pixels
- * @param[in]  dark_mask   The mask recording positions of dark pixels
- * @param[out] isCorner    Indicate whether candidate pixel is corner
- */
-#define CHECK_CORNER(bright_mask, dark_mask, isCorner)    \
-    {                                                     \
-        for(int i = 0; i < 16; i++)                       \
-        {                                                 \
-            isCorner |= ((bright_mask & 0x1FF) == 0x1FF); \
-            isCorner |= ((dark_mask & 0x1FF) == 0x1FF);   \
-            if(isCorner)                                  \
-            {                                             \
-                break;                                    \
-            }                                             \
-            bright_mask >>= 1;                            \
-            dark_mask >>= 1;                              \
-        }                                                 \
-    }
-
-/* Calculate pixel's strength */
-uchar compute_strength(uchar candidate_pixel, __global unsigned char *ptr, unsigned int stride, unsigned char threshold)
-{
-    short a = threshold;
-    short b = 255;
-    while(b - a > 1)
-    {
-        uchar        c           = convert_uchar_sat((a + b) / 2);
-        unsigned int bright_mask = 0;
-        unsigned int dark_mask   = 0;
-
-        unsigned char p_bright = add_sat(candidate_pixel, c);
-        unsigned char p_dark   = sub_sat(candidate_pixel, c);
-
-        bool isCorner = 0;
-
-        for(uint i = 0; i < 16; i++)
-        {
-            LOAD_AND_SET_MASK(ptr, i, stride, p_dark, p_bright, dark_mask, bright_mask)
-        }
-
-        bright_mask |= (bright_mask << 16);
-        dark_mask |= (dark_mask << 16);
-        CHECK_CORNER(bright_mask, dark_mask, isCorner);
-
-        if(isCorner)
-        {
-            a = convert_short(c);
-        }
-        else
-        {
-            b = convert_short(c);
-        }
-    }
-    return a;
-}
-
-/** Fast corners implementation. Calculates and returns the strength of each pixel.
- *
- * The algorithm loops through the 16 pixels in the Bresenham circle and sets the corresponding bit in the low 16 bits of the
- * bright or dark mask when a pixel is bright or dark. It then copies the low 16 bits into the high 16 bits of each mask and
- * right-shifts the masks one bit at a time, checking whether the 9 contiguous bits starting from the LSB are all set.
- *
- * @param[in]  input_ptr                            Pointer to the first source image. Supported data types: U8
- * @param[in]  input_stride_x                       Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  threshold_value                      Threshold value.
- *
- */
-__kernel void fast_corners(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output),
-    float threshold_value)
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    const unsigned char threshold = (uchar)threshold_value;
-
-    unsigned int bright_mask = 0;
-    unsigned int dark_mask   = 0;
-
-    unsigned char isCorner = 0;
-
-    unsigned char p        = *in.ptr;
-    unsigned char p_bright = add_sat(p, threshold);
-    unsigned char p_dark   = sub_sat(p, threshold);
-
-    LOAD_AND_SET_MASK(in.ptr, 0, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 4, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 8, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 12, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-
-    if(((bright_mask | dark_mask) & 0x1111) == 0)
-    {
-        *out.ptr = 0;
-        return;
-    }
-
-    LOAD_AND_SET_MASK(in.ptr, 1, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 2, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 3, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 5, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 6, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 7, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 9, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 10, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 11, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 13, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 14, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 15, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-
-    bright_mask |= (bright_mask << 16);
-    dark_mask |= (dark_mask << 16);
-
-    CHECK_CORNER(bright_mask, dark_mask, isCorner)
-
-    if(!isCorner)
-    {
-        *out.ptr = 0;
-        return;
-    }
-
-#ifdef USE_MAXSUPPRESSION
-    *out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold);
-#else  /* USE_MAXSUPPRESSION */
-    *out.ptr = 1;
-#endif /* USE_MAXSUPPRESSION */
-}
-
-/** Copy result to Keypoint buffer and count number of corners
- *
- * @param[in]  input_ptr                           Pointer to the image with calculated strengths. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  max_num_points                      The maximum number of keypoints the array can hold
- * @param[in]  offset                              Offset added to both the x and y coordinates of each stored keypoint
- * @param[in,out] num_of_points                    Counter of points found (read and atomically incremented by the kernel)
- * @param[out] out                                 The keypoints found
- *
- */
-__kernel void copy_to_keypoint(
-    IMAGE_DECLARATION(input),
-    uint     max_num_points,
-    uint     offset,
-    __global uint *num_of_points,
-    __global Keypoint *out)
-{
-#ifndef UPDATE_NUMBER
-    if(*num_of_points >= max_num_points)
-    {
-        return;
-    }
-#endif /* UPDATE_NUMBER */
-
-    Image in = CONVERT_TO_IMAGE_STRUCT(input);
-
-    uchar value = *in.ptr;
-
-    if(value > 0)
-    {
-        int id = atomic_inc(num_of_points);
-        if(id < max_num_points)
-        {
-            out[id].strength        = value;
-            out[id].x               = get_global_id(0) + offset;
-            out[id].y               = get_global_id(1) + offset;
-            out[id].tracking_status = 1;
-            out[id].scale           = 0.f;
-            out[id].orientation     = 0.f;
-            out[id].error           = 0.f;
-        }
-    }
-}

diff --git a/src/core/CL/cl_kernels/gaussian_pyramid.cl b/src/core/CL/cl_kernels/gaussian_pyramid.cl
deleted file mode 100644
index ae2c31a..0000000
--- a/src/core/CL/cl_kernels/gaussian_pyramid.cl
+++ /dev/null

@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Computes the Gaussian Filter 1x5 (weights 1-4-6-4-1, not normalised) + sub-sampling by 2 along the X direction
- *
- * @note Each thread computes 8 output values from 20 consecutive input bytes
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void gaussian1x5_sub_x(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load the 20 consecutive bytes covered by the 8 outputs (16 + 4 bytes starting at src.ptr)
-    uchar16 temp0 = vload16(0, src.ptr);
-    uchar4  temp1 = vload4(0, src.ptr + 16);
-
-    // Gather every second byte for each of the five taps (this is the X sub-sampling) and widen to ushort8
-    ushort8 l2_data = convert_ushort8((uchar8)(temp0.s02468ACE));
-    ushort8 l1_data = convert_ushort8((uchar8)(temp0.s13579BDF));
-    ushort8 m_data  = convert_ushort8((uchar8)(temp0.s2468, temp0.sACE, temp1.s0));
-    ushort8 r1_data = convert_ushort8((uchar8)(temp0.s3579, temp0.sBDF, temp1.s1));
-    ushort8 r2_data = convert_ushort8((uchar8)(temp0.s468A, temp0.sCE, temp1.s02));
-
-    // Weighted sum along the X direction: l2 + 4 * l1 + 6 * m + 4 * r1 + r2
-    ushort8 pixels = l2_data + r2_data;
-    pixels += l1_data * (ushort8)4;
-    pixels += m_data * (ushort8)6;
-    pixels += r1_data * (ushort8)4;
-
-    // Store the 8 sums as 16-bit values (no scaling is applied in this kernel)
-    vstore8(pixels, 0, (__global ushort *)dst.ptr);
-}
-
-/** Computes the Gaussian Filter 5x1 (weights 1-4-6-4-1, result divided by 256) + sub-sampling along the Y direction
- *
- * @note Each thread computes 8 pixels
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void gaussian5x1_sub_y(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load 8 values from each of five rows (row arguments 0..4 passed to offset(); x argument is 0)
-    ushort8 u2_data = vload8(0, (__global ushort *)offset(&src, 0, 0));
-    ushort8 u1_data = vload8(0, (__global ushort *)offset(&src, 0, 1));
-    ushort8 m_data  = vload8(0, (__global ushort *)offset(&src, 0, 2));
-    ushort8 d1_data = vload8(0, (__global ushort *)offset(&src, 0, 3));
-    ushort8 d2_data = vload8(0, (__global ushort *)offset(&src, 0, 4));
-
-    // Weighted sum along the Y direction: u2 + 4 * u1 + 6 * m + 4 * d1 + d2
-    ushort8 pixels = u2_data + d2_data;
-    pixels += u1_data * (ushort8)4;
-    pixels += m_data * (ushort8)6;
-    pixels += d1_data * (ushort8)4;
-
-    // Scale result: >> 8 divides by 256 (16 * 16; presumably the weight sums of an X pass and this Y pass - TODO confirm)
-    pixels >>= (ushort8)8;
-
-    // Saturate to 8 bit and store the 8 output pixels
-    vstore8(convert_uchar8_sat(pixels), 0, dst.ptr);
-}

diff --git a/src/core/CL/cl_kernels/harris_corners.cl b/src/core/CL/cl_kernels/harris_corners.cl
deleted file mode 100644
index 3e3c9fd..0000000
--- a/src/core/CL/cl_kernels/harris_corners.cl
+++ /dev/null

@@ -1,376 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Function running harris score on 3x3 block size; each work-item writes 4 consecutive scores
- *
- * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
- *             e.g. -DDATA_TYPE=short.
- *
- * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
- * @param[in]  src_gx_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gx_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
- * @param[in]  src_gy_stride_x                      Stride of the second source image in X dimension (in bytes)
- * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gy_stride_y                      Stride of the second source image in Y dimension (in bytes)
- * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
- * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
- * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
- * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
- * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
- * @param[in]  strength_thresh                      Threshold: scores that are not greater than this value are written as 0
- * @param[in]  pow4_normalization_factor            Normalization factor the harris score is multiplied by
- */
-__kernel void harris_score_3x3(
-    IMAGE_DECLARATION(src_gx),
-    IMAGE_DECLARATION(src_gy),
-    IMAGE_DECLARATION(vc),
-    float sensitivity,
-    float strength_thresh,
-    float pow4_normalization_factor)
-{
-    Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
-    Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
-    Image vc     = CONVERT_TO_IMAGE_STRUCT(vc);
-
-    /* Accumulators for the sums of Gx^2, Gy^2 and Gx*Gy over the block (one lane per output score) */
-    float4 gx2  = (float4)0.0f;
-    float4 gy2  = (float4)0.0f;
-    float4 gxgy = (float4)0.0f;
-
-    /* Row0: 8 elements loaded with offsets (-1, -1); lanes 0..5 are used as left/middle/right columns */
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, -1));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, -1));
-
-    float4 l_gx = convert_float4(temp_gx.s0123);
-    float4 m_gx = convert_float4(temp_gx.s1234);
-    float4 r_gx = convert_float4(temp_gx.s2345);
-
-    float4 l_gy = convert_float4(temp_gy.s0123);
-    float4 m_gy = convert_float4(temp_gy.s1234);
-    float4 r_gy = convert_float4(temp_gy.s2345);
-
-    gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
-    gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
-    gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
-
-    /* Row1: same pattern with offsets (-1, 0) */
-    temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 0));
-    temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 0));
-
-    l_gx = convert_float4(temp_gx.s0123);
-    m_gx = convert_float4(temp_gx.s1234);
-    r_gx = convert_float4(temp_gx.s2345);
-
-    l_gy = convert_float4(temp_gy.s0123);
-    m_gy = convert_float4(temp_gy.s1234);
-    r_gy = convert_float4(temp_gy.s2345);
-
-    gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
-    gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
-    gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
-
-    /* Row2: same pattern with offsets (-1, 1) */
-    temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 1));
-    temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 1));
-
-    l_gx = convert_float4(temp_gx.s0123);
-    m_gx = convert_float4(temp_gx.s1234);
-    r_gx = convert_float4(temp_gx.s2345);
-
-    l_gy = convert_float4(temp_gy.s0123);
-    m_gy = convert_float4(temp_gy.s1234);
-    r_gy = convert_float4(temp_gy.s2345);
-
-    gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
-    gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
-    gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
-
-    /* Compute trace and determinant of the matrix [gx2 gxgy; gxgy gy2] */
-    float4 trace = gx2 + gy2;
-    float4 det   = gx2 * gy2 - (gxgy * gxgy);
-
-    /* Compute harris score: (det - sensitivity * trace^2) * pow4_normalization_factor */
-    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
-
-    mc = select(0.0f, mc, mc > (float4)strength_thresh); /* Keep scores above the threshold, zero the others */
-
-    vstore4(mc, 0, (__global float *)vc.ptr);
-}
-
-/** Sums Gx^2, Gy^2 and Gx*Gy over a 1x5 window for 4 consecutive positions; returns them as (gx2, gy2, gxgy, 0) in a float16.
- *
- * @param[in] src_gx Pointer to gx gradient image.
- * @param[in] src_gy Pointer to gy gradient image.
- * @param[in] row    Relative row (passed as the y argument of offset()).
- */
-inline float16 harris_score_1x5(Image *src_gx, Image *src_gy, int row)
-{
-    float4 gx2  = 0.0f;
-    float4 gy2  = 0.0f;
-    float4 gxgy = 0.0f;
-
-    /* Row: 8 elements loaded with x offset -2; the five 4-lane windows below slide over lanes 0..7 */
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gx = vload8(0, (__global DATA_TYPE *)offset(src_gx, -2, row));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gy = vload8(0, (__global DATA_TYPE *)offset(src_gy, -2, row));
-
-    float4 gx = convert_float4(temp_gx.s0123);
-    float4 gy = convert_float4(temp_gy.s0123);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx.s1234);
-    gy = convert_float4(temp_gy.s1234);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx.s2345);
-    gy = convert_float4(temp_gy.s2345);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx.s3456);
-    gy = convert_float4(temp_gy.s3456);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx.s4567);
-    gy = convert_float4(temp_gy.s4567);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    return (float16)(gx2, gy2, gxgy, (float4)0); /* Last four lanes are zero padding */
-}
-
-/** Function running harris score on 5x5 block size; each work-item writes 4 consecutive scores
- *
- * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
- *             e.g. -DDATA_TYPE=short.
- *
- * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
- * @param[in]  src_gx_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gx_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
- * @param[in]  src_gy_stride_x                      Stride of the second source image in X dimension (in bytes)
- * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gy_stride_y                      Stride of the second source image in Y dimension (in bytes)
- * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
- * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
- * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
- * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
- * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
- * @param[in]  strength_thresh                      Threshold: scores that are not greater than this value are written as 0
- * @param[in]  pow4_normalization_factor            Normalization factor the harris score is multiplied by
- */
-__kernel void harris_score_5x5(
-    IMAGE_DECLARATION(src_gx),
-    IMAGE_DECLARATION(src_gy),
-    IMAGE_DECLARATION(vc),
-    float sensitivity,
-    float strength_thresh,
-    float pow4_normalization_factor)
-{
-    Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
-    Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
-    Image vc     = CONVERT_TO_IMAGE_STRUCT(vc);
-
-    /* Packed sums of Gx^2 (lanes 0-3), Gy^2 (lanes 4-7) and Gx*Gy (lanes 8-B); lanes C-F are unused */
-    float16 res = (float16)0.0f;
-
-    /* Accumulate the five rows with relative index -2..2 */
-    for(int i = -2; i < 3; i++)
-    {
-        res += harris_score_1x5(&src_gx, &src_gy, i);
-    }
-
-    float4 gx2  = res.s0123;
-    float4 gy2  = res.s4567;
-    float4 gxgy = res.s89AB;
-
-    /* Compute trace and determinant of the matrix [gx2 gxgy; gxgy gy2] */
-    float4 trace = gx2 + gy2;
-    float4 det   = gx2 * gy2 - (gxgy * gxgy);
-
-    /* Compute harris score: (det - sensitivity * trace^2) * pow4_normalization_factor */
-    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
-
-    mc = select(0.0f, mc, mc > (float4)strength_thresh); /* Keep scores above the threshold, zero the others */
-
-    vstore4(mc, 0, (__global float *)vc.ptr);
-}
-
-/** Sums Gx^2, Gy^2 and Gx*Gy over a 1x7 window for 4 consecutive positions; returns them as (gx2, gy2, gxgy, 0) in a float16.
- *
- * @param[in] src_gx Pointer to gx gradient image.
- * @param[in] src_gy Pointer to gy gradient image.
- * @param[in] row    Relative row (passed as the y argument of offset()).
- */
-inline float16 harris_score_1x7(Image *src_gx, Image *src_gy, int row)
-{
-    float4 gx2  = 0.0f;
-    float4 gy2  = 0.0f;
-    float4 gxgy = 0.0f;
-
-    /* Row: 8 elements loaded with x offset -3 plus 2 more with x offset 5, i.e. 10 elements per image */
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gx0 = vload8(0, (__global DATA_TYPE *)offset(src_gx, -3, row));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gy0 = vload8(0, (__global DATA_TYPE *)offset(src_gy, -3, row));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    temp_gx1 = vload2(0, (__global DATA_TYPE *)offset(src_gx, 5, row));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    temp_gy1 = vload2(0, (__global DATA_TYPE *)offset(src_gy, 5, row));
-
-    float4 gx = convert_float4(temp_gx0.s0123);
-    float4 gy = convert_float4(temp_gy0.s0123);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx0.s1234);
-    gy = convert_float4(temp_gy0.s1234);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx0.s2345);
-    gy = convert_float4(temp_gy0.s2345);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx0.s3456);
-    gy = convert_float4(temp_gy0.s3456);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx0.s4567);
-    gy = convert_float4(temp_gy0.s4567);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s567, temp_gx1.s0)); /* Window continues into the 2-element load */
-    gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s567, temp_gy1.s0));
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s67, temp_gx1.s01));
-    gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s67, temp_gy1.s01));
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    return (float16)(gx2, gy2, gxgy, (float4)0); /* Last four lanes are zero padding */
-}
-
-/** Function running harris score on 7x7 block size; each work-item writes 4 consecutive scores
- *
- * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
- *             e.g. -DDATA_TYPE=short.
- *
- * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
- * @param[in]  src_gx_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gx_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
- * @param[in]  src_gy_stride_x                      Stride of the second source image in X dimension (in bytes)
- * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gy_stride_y                      Stride of the second source image in Y dimension (in bytes)
- * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
- * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
- * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
- * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
- * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
- * @param[in]  strength_thresh                      Threshold: scores that are not greater than this value are written as 0
- * @param[in]  pow4_normalization_factor            Normalization factor the harris score is multiplied by
- */
-__kernel void harris_score_7x7(
-    IMAGE_DECLARATION(src_gx),
-    IMAGE_DECLARATION(src_gy),
-    IMAGE_DECLARATION(vc),
-    float sensitivity,
-    float strength_thresh,
-    float pow4_normalization_factor)
-{
-    Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
-    Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
-    Image vc     = CONVERT_TO_IMAGE_STRUCT(vc);
-
-    /* Packed sums of Gx^2 (lanes 0-3), Gy^2 (lanes 4-7) and Gx*Gy (lanes 8-B); lanes C-F are unused */
-    float16 res = (float16)0.0f;
-
-    /* Accumulate the seven rows with relative index -3..3 */
-    for(int i = -3; i < 4; i++)
-    {
-        res += harris_score_1x7(&src_gx, &src_gy, i);
-    }
-
-    float4 gx2  = res.s0123;
-    float4 gy2  = res.s4567;
-    float4 gxgy = res.s89AB;
-
-    /* Compute trace and determinant of the matrix [gx2 gxgy; gxgy gy2] */
-    float4 trace = gx2 + gy2;
-    float4 det   = gx2 * gy2 - (gxgy * gxgy);
-
-    /* Compute harris score: (det - sensitivity * trace^2) * pow4_normalization_factor */
-    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
-
-    mc = select(0.0f, mc, mc > (float4)strength_thresh); /* Keep scores above the threshold, zero the others */
-
-    vstore4(mc, 0, (__global float *)vc.ptr);
-}

diff --git a/src/core/CL/cl_kernels/histogram.cl b/src/core/CL/cl_kernels/histogram.cl
deleted file mode 100644
index a93cb4d..0000000
--- a/src/core/CL/cl_kernels/histogram.cl
+++ /dev/null

@@ -1,243 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define VATOMIC_INC16(histogram, win_pos)   \
-    { /* one atomic_inc per lane s0..sf */ \
-        atomic_inc(histogram + win_pos.s0); \
-        atomic_inc(histogram + win_pos.s1); \
-        atomic_inc(histogram + win_pos.s2); \
-        atomic_inc(histogram + win_pos.s3); \
-        atomic_inc(histogram + win_pos.s4); \
-        atomic_inc(histogram + win_pos.s5); \
-        atomic_inc(histogram + win_pos.s6); \
-        atomic_inc(histogram + win_pos.s7); \
-        atomic_inc(histogram + win_pos.s8); \
-        atomic_inc(histogram + win_pos.s9); \
-        atomic_inc(histogram + win_pos.sa); \
-        atomic_inc(histogram + win_pos.sb); \
-        atomic_inc(histogram + win_pos.sc); \
-        atomic_inc(histogram + win_pos.sd); \
-        atomic_inc(histogram + win_pos.se); \
-        atomic_inc(histogram + win_pos.sf); \
-    }
-
-/** Calculate the histogram of an 8 bit grayscale image.
- *
- * Each thread will process 16 pixels and use one local atomic operation per pixel.
- * When all work items in a work group are done the resulting local histograms are
- * added to the global histogram using global atomics.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of length of num_bins
- *
- * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  histogram_local                     Per workgroup local histogram, U32. Entry num_bins is also written (pixels outside [offset, offrange))
- * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
- * @param[in]  num_bins                            The number of bins
- * @param[in]  offset                              The start of values to use (inclusive)
- * @param[in]  range                               Divisor of the bin index: bin = ((value - offset) * num_bins) / range
- * @param[in]  offrange                            The maximum value (exclusive)
- */
-__kernel void hist_local_kernel(IMAGE_DECLARATION(input),
-                                __local uint *histogram_local,
-                                __global uint *restrict histogram,
-                                uint                    num_bins,
-                                uint                    offset,
-                                uint                    range,
-                                uint                    offrange)
-{
-    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
-    uint  local_id_x   = get_local_id(0);
-
-    uint local_x_size = get_local_size(0);
-
-    if(num_bins > local_x_size) /* More bins than work-items: every work-item clears a strided subset of the bins */
-    {
-        for(int i = local_id_x; i < num_bins; i += local_x_size)
-        {
-            histogram_local[i] = 0;
-        }
-    }
-    else
-    {
-        if(local_id_x <= num_bins) /* "<=" on purpose: this also clears the overflow entry at index num_bins */
-        {
-            histogram_local[local_id_x] = 0;
-        }
-    }
-
-    uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
-
-    uint16 win_pos = select(num_bins, ((vals - offset) * num_bins) / range, (vals >= offset && vals < offrange)); /* Out-of-range pixels get index num_bins */
-
-    barrier(CLK_LOCAL_MEM_FENCE); /* Local histogram must be cleared before anybody increments it */
-    VATOMIC_INC16(histogram_local, win_pos);
-    barrier(CLK_LOCAL_MEM_FENCE); /* All increments must be done before the local histogram is flushed */
-
-    if(num_bins > local_x_size)
-    {
-        for(int i = local_id_x; i < num_bins; i += local_x_size)
-        {
-            atomic_add(histogram + i, histogram_local[i]);
-        }
-    }
-    else
-    {
-        if(local_id_x < num_bins) /* Strictly "<": histogram has num_bins entries, the overflow entry is not flushed */
-        {
-            atomic_add(histogram + local_id_x, histogram_local[local_id_x]);
-        }
-    }
-}
-
-/** Calculate the histogram of an 8 bit grayscale image's border.
- *
- * Each thread will process one pixel using global atomic.
- * No local histogram is used: a pixel whose value lies in [offset, offrange)
- * increments its bin of the global histogram directly, other pixels are ignored.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of length of num_bins
- *
- * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
- * @param[in]  num_bins                            The number of bins
- * @param[in]  offset                              The start of values to use (inclusive)
- * @param[in]  range                               Divisor of the bin index: bin = ((value - offset) * num_bins) / range
- * @param[in]  offrange                            The maximum value (exclusive)
- */
-__kernel void hist_border_kernel(IMAGE_DECLARATION(input),
-                                 __global uint *restrict histogram,
-                                 uint                    num_bins,
-                                 uint                    offset,
-                                 uint                    range,
-                                 uint                    offrange)
-{
-    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
-
-    uint val = (uint)(*input_buffer.ptr);
-
-    uint win_pos = (val >= offset) ? (((val - offset) * num_bins) / range) : 0; /* Guarded so (val - offset) never wraps; 0 is a placeholder */
-
-    if(val >= offset && (val < offrange)) /* Only in-range pixels are counted, so the placeholder index is never used */
-    {
-        atomic_inc(histogram + win_pos);
-    }
-}
-
-/** Calculate the histogram of an 8 bit grayscale image with bin size of 256 and window size of 1.
- *
- * Each work-item loads 16 consecutive pixels and uses one local atomic operation per pixel.
- * Once all work-items of a work-group are done, the work-group's local histogram is
- * added to the global histogram using global atomics.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of 256 elements
- *
- * @note histogram_local is indexed with the raw pixel value and is cleared and merged over 256 entries,
- * so it must provide room for at least 256 uint elements.
- *
- * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  histogram_local                     The local buffer holding the per-work-group histogram. Supported data types: U32
- * @param[out] histogram                           The output buffer to hold histogram final result; the local counts are added to its current contents. Supported data types: U32
- */
-__kernel void hist_local_kernel_fixed(IMAGE_DECLARATION(input),
-                                      __local uint *histogram_local,
-                                      __global uint *restrict histogram)
-{
-    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
-
-    uint local_index  = get_local_id(0);
-    uint local_x_size = get_local_size(0);
-
-    for(int i = local_index; i < 256; i += local_x_size) // Work-items clear the 256 local bins cooperatively, local_x_size apart
-    {
-        histogram_local[i] = 0;
-    }
-
-    uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
-
-    barrier(CLK_LOCAL_MEM_FENCE); // All local bins must be zeroed before any work-item starts counting
-
-    atomic_inc(histogram_local + vals.s0);
-    atomic_inc(histogram_local + vals.s1);
-    atomic_inc(histogram_local + vals.s2);
-    atomic_inc(histogram_local + vals.s3);
-    atomic_inc(histogram_local + vals.s4);
-    atomic_inc(histogram_local + vals.s5);
-    atomic_inc(histogram_local + vals.s6);
-    atomic_inc(histogram_local + vals.s7);
-    atomic_inc(histogram_local + vals.s8);
-    atomic_inc(histogram_local + vals.s9);
-    atomic_inc(histogram_local + vals.sa);
-    atomic_inc(histogram_local + vals.sb);
-    atomic_inc(histogram_local + vals.sc);
-    atomic_inc(histogram_local + vals.sd);
-    atomic_inc(histogram_local + vals.se);
-    atomic_inc(histogram_local + vals.sf);
-
-    barrier(CLK_LOCAL_MEM_FENCE); // All local counts must be complete before they are merged into the global histogram
-
-    for(int i = local_index; i < 256; i += local_x_size) // Same cooperative split as the clearing loop above
-    {
-        atomic_add(histogram + i, histogram_local[i]);
-    }
-}
-
-/** Calculate the histogram of an 8 bit grayscale image with bin size as 256 and window size as 1.
- *
- * Each work-item reads a single pixel and increments the matching bin of the
- * global histogram directly with a global atomic. No local histogram is used
- * by this kernel.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of 256
- *
- * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
- */
-__kernel void hist_border_kernel_fixed(IMAGE_DECLARATION(input),
-                                       __global uint *restrict histogram)
-{
-    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
-    atomic_inc(histogram + *input_buffer.ptr); // The pixel value itself is the bin index
-}

diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
deleted file mode 100644
index b14f361..0000000
--- a/src/core/CL/cl_kernels/hog.cl
+++ /dev/null

@@ -1,456 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-#if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE)
-
-/** This OpenCL kernel computes the HOG orientation binning
- *
- * @attention The following variables must be passed at compile time:
- *
- * -# -DCELL_WIDTH = Width of the cell
- * -# -DCELL_HEIGHT = Height of the cell
- * -# -DNUM_BINS = Number of bins for each cell
- * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG
- *
- * @note Each work-item computes a single cell
- *
- * @note The magnitude of each pixel is split between two neighbouring bins: the bin given by the integer part of
- *       (phase * PHASE_SCALE + 0.5f) receives the share (1 - w1) and the following bin receives w1, where w1 is the
- *       fractional part. A bin index equal to NUM_BINS wraps around to bin 0.
- *
- * @param[in]  mag_ptr                             Pointer to the source image which stores the magnitude of the gradient for each pixel. Supported data types: S16
- * @param[in]  mag_stride_x                        Stride of the magnitude image in X dimension (in bytes)
- * @param[in]  mag_step_x                          mag_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  mag_stride_y                        Stride of the magnitude image in Y dimension (in bytes)
- * @param[in]  mag_step_y                          mag_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  mag_offset_first_element_in_bytes   The offset of the first element in the magnitude image
- * @param[in]  phase_ptr                           Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8
- * @param[in]  phase_stride_x                      Stride of the phase image in X dimension (in bytes)
- * @param[in]  phase_step_x                        phase_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  phase_stride_y                      Stride of the phase image in Y dimension (in bytes)
- * @param[in]  phase_step_y                        phase_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  phase_offset_first_element_in_bytes The offset of the first element in the phase image
- * @param[out] dst_ptr                             Pointer to the destination image which stores the local HOG for each cell. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in]  dst_stride_x                        Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                        Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination image
- */
-__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
-                                      IMAGE_DECLARATION(phase),
-                                      IMAGE_DECLARATION(dst))
-{
-    float bins[NUM_BINS] = { 0 };
-
-    // Compute the addresses of the magnitude and phase images
-    Image mag   = CONVERT_TO_IMAGE_STRUCT(mag);
-    Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
-
-    __global uchar *mag_row_ptr   = mag.ptr;
-    __global uchar *phase_row_ptr = phase.ptr;
-
-    for(int yc = 0; yc < CELL_HEIGHT; ++yc)
-    {
-        int xc = 0;
-        for(; xc <= (CELL_WIDTH - 4); xc += 4)
-        {
-            // Load 4 magnitude (16-bit) and 4 phase (8-bit) values
-            const float4 mag_f32   = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
-            float4       phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
-
-            // Scale phase: phase * scale + 0.5f
-            phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
-
-            // Compute histogram index (integer part of the scaled phase).
-            int4 hidx_s32 = convert_int4(phase_f32);
-
-            // Compute magnitude weights (w0 and w1)
-            const float4 hidx_f32 = convert_float4(hidx_s32);
-
-            // w1 = phase_f32 - hidx_s32 (fractional part of the scaled phase)
-            const float4 w1_f32 = phase_f32 - hidx_f32;
-
-            // w0 = 1.0 - w1
-            const float4 w0_f32 = (float4)1.0f - w1_f32;
-
-            // Calculate the weights for splitting vote
-            const float4 mag_w0_f32 = mag_f32 * w0_f32;
-            const float4 mag_w1_f32 = mag_f32 * w1_f32;
-
-            // Weighted vote between 2 bins
-
-            // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
-            hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
-
-            // Bin 0: receives the w0 share of the magnitude
-            bins[hidx_s32.s0] += mag_w0_f32.s0;
-            bins[hidx_s32.s1] += mag_w0_f32.s1;
-            bins[hidx_s32.s2] += mag_w0_f32.s2;
-            bins[hidx_s32.s3] += mag_w0_f32.s3;
-
-            hidx_s32 += (int4)1;
-
-            // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
-            hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
-
-            // Bin 1: the following bin receives the w1 share of the magnitude
-            bins[hidx_s32.s0] += mag_w1_f32.s0;
-            bins[hidx_s32.s1] += mag_w1_f32.s1;
-            bins[hidx_s32.s2] += mag_w1_f32.s2;
-            bins[hidx_s32.s3] += mag_w1_f32.s3;
-        }
-
-        // Left over computation (scalar path for the remaining CELL_WIDTH % 4 pixels)
-        for(; xc < CELL_WIDTH; xc++)
-        {
-            const float mag_value   = *((__global short *)mag_row_ptr + xc);
-            const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
-            const float w1          = phase_value - floor(phase_value);
-
-            // The quantised phase is the histogram index [0, NUM_BINS - 1]
-            // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
-            const uint hidx = (uint)(phase_value) % NUM_BINS;
-
-            // Weighted vote between 2 bins
-            bins[hidx] += mag_value * (1.0f - w1);
-            bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
-        }
-
-        // Point to the next row of magnitude and phase images
-        mag_row_ptr += mag_stride_y;
-        phase_row_ptr += phase_stride_y;
-    }
-
-    // Compute address for the destination image
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Store the local HOG in the global memory, 4 bins at a time
-    int xc = 0;
-    for(; xc <= (NUM_BINS - 4); xc += 4)
-    {
-        float4 values = vload4(0, bins + xc);
-
-        vstore4(values, 0, ((__global float *)dst.ptr) + xc);
-    }
-
-    // Left over stores
-    for(; xc < NUM_BINS; ++xc)
-    {
-        ((__global float *)dst.ptr)[xc] = bins[xc];
-    }
-}
-#endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */
-
-#if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)
-
-#ifndef L2_NORM
-#error The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel
-#endif /* not L2_NORM */
-
-#ifndef L2HYS_NORM
-#error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel
-#endif /* not L2HYS_NORM */
-
-#ifndef L1_NORM
-#error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel
-#endif /* not L1_NORM */
-
-/** This OpenCL kernel computes the HOG block normalization
- *
- * @attention The following variables must be passed at compile time:
- *
- * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block along the Y direction (rows of the source image read per block)
- * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction
- * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block
- * -# -DHOG_NORM_TYPE = Normalization type
- * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method
- * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM
- * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM
- * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM
- *
- * @note Each work-item computes a single block
- *
- * @note The destination is first filled with the un-normalized block values, stored contiguously, and is then
- *       scaled in place. For L2HYS_NORM an additional in-place pass scales and clips the values before the final scaling.
- *
- * @param[in]  src_ptr                           Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image which stores the normalized HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void hog_block_normalization(IMAGE_DECLARATION(src),
-                                      IMAGE_DECLARATION(dst))
-{
-    float  sum     = 0.0f;
-    float4 sum_f32 = (float4)(0.0f);
-
-    // Compute address for the source and destination tensor
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
-    {
-        const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
-
-        int xc = 0;
-        for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
-        {
-            const float4 val0 = vload4(0, hist_ptr + xc + 0);
-            const float4 val1 = vload4(0, hist_ptr + xc + 4);
-            const float4 val2 = vload4(0, hist_ptr + xc + 8);
-            const float4 val3 = vload4(0, hist_ptr + xc + 12);
-
-#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
-            // Compute val^2 for L2_NORM or L2HYS_NORM
-            sum_f32 += val0 * val0;
-            sum_f32 += val1 * val1;
-            sum_f32 += val2 * val2;
-            sum_f32 += val3 * val3;
-#else  /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-            // Compute |val| for L1_NORM
-            sum_f32 += fabs(val0);
-            sum_f32 += fabs(val1);
-            sum_f32 += fabs(val2);
-            sum_f32 += fabs(val3);
-#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-
-            // Store linearly the input values un-normalized in the output image. These values will be reused for the normalization.
-            // This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values
-            // will be accessed consecutively
-            vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
-            vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
-            vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
-            vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
-        }
-
-        // Compute left over
-        for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
-        {
-            const float val = hist_ptr[xc];
-
-#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
-            sum += val * val;
-#else  /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-            sum += fabs(val);
-#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-
-            ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
-        }
-    }
-
-    sum += dot(sum_f32, (float4)1.0f); // Horizontal add of the 4 vector lanes into the scalar sum
-
-    float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f); // NOTE(review): sqrt is applied for L1_NORM as well - confirm this is intended
-
-#if(HOG_NORM_TYPE == L2HYS_NORM)
-    // Reset sum
-    sum_f32 = (float4)0.0f;
-    sum     = 0.0f;
-
-    int k = 0;
-    for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
-    {
-        float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
-        float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
-        float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
-        float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
-
-        // Scale val
-        val0 = val0 * (float4)scale;
-        val1 = val1 * (float4)scale;
-        val2 = val2 * (float4)scale;
-        val3 = val3 * (float4)scale;
-
-        // Clip val if over L2_HYST_THRESHOLD
-        val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
-        val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
-        val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
-        val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
-
-        // Compute val^2
-        sum_f32 += val0 * val0;
-        sum_f32 += val1 * val1;
-        sum_f32 += val2 * val2;
-        sum_f32 += val3 * val3;
-
-        vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
-        vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
-        vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
-        vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
-    }
-
-    // Compute left over
-    for(; k < NUM_BINS_PER_BLOCK; ++k)
-    {
-        float val = ((__global float *)dst.ptr)[k] * scale;
-
-        // Clip scaled input_value if over L2_HYST_THRESHOLD
-        val = fmin(val, (float)L2_HYST_THRESHOLD);
-
-        sum += val * val;
-
-        ((__global float *)dst.ptr)[k] = val;
-    }
-
-    sum += dot(sum_f32, (float4)1.0f);
-
-    // We use the same constants as OpenCV
-    scale = 1.0f / (sqrt(sum) + 1e-3f);
-
-#endif /* (HOG_NORM_TYPE == L2HYS_NORM) */
-
-    int i = 0;
-    for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
-    {
-        float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
-        float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
-        float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
-        float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
-
-        // Multiply val by the normalization scale factor
-        val0 = val0 * (float4)scale;
-        val1 = val1 * (float4)scale;
-        val2 = val2 * (float4)scale;
-        val3 = val3 * (float4)scale;
-
-        vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
-        vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
-        vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
-        vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
-    }
-
-    for(; i < NUM_BINS_PER_BLOCK; ++i) // Left over: scale the remaining values one by one
-    {
-        ((__global float *)dst.ptr)[i] *= scale;
-    }
-}
-#endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */
-
-#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
-
-/** This OpenCL kernel computes the HOG detector using linear SVM
- *
- * @attention The following variables must be passed at compile time:
- *
- * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction
- * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction
- * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
- * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectionWindow array
- * -# -DIDX_CLASS = Index of the class to detect
- * -# -DDETECTION_WINDOW_STRIDE_WIDTH = Detection window stride for the X direction
- * -# -DDETECTION_WINDOW_STRIDE_HEIGHT = Detection window stride for the Y direction
- * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
- * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
- *
- * @note Each work-item computes a single detection window
- *
- * @note num_detection_windows is incremented for every window whose score exceeds THRESHOLD, even when the
- *       DetectionWindow array is already full, so its final value can be larger than MAX_NUM_DETECTION_WINDOWS.
- *       Only entries with an index below MAX_NUM_DETECTION_WINDOWS are written to dst.
- *
- * @param[in]     src_ptr                           Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in]     src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]     src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]     hog_descriptor                    Pointer to HOG descriptor. The bias is read from the element that follows the NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y weights. Supported data types: F32
- * @param[out]    dst                               Pointer to DetectionWindow array
- * @param[in,out] num_detection_windows             Number of objects detected. It is read to skip work once the array is full and atomically incremented for each detection
- */
-__kernel void hog_detector(IMAGE_DECLARATION(src),
-                           __global float *hog_descriptor,
-                           __global DetectionWindow *dst,
-                           __global uint *num_detection_windows)
-{
-    // Check if the DetectionWindow array is full
-    if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
-    {
-        return;
-    }
-
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-    const int src_step_y_f32 = src_stride_y / sizeof(float); // Row stride expressed in float elements instead of bytes
-
-    // Init score_f32 with 0
-    float4 score_f32 = (float4)0.0f;
-
-    // Init score with 0
-    float score = 0.0f;
-
-    __global float *src_row_ptr = (__global float *)src.ptr;
-
-    // Compute Linear SVM
-    for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
-    {
-        int xb = 0;
-
-        const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
-
-        for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8) // Strict '<': a final group of exactly 8 bins is left to the scalar loop below
-        {
-            // Load descriptor values
-            float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
-            float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
-
-            float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
-            float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
-
-            // Multiply accumulate
-            score_f32 += a0_f32 * b0_f32;
-            score_f32 += a1_f32 * b1_f32;
-        }
-
-        for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
-        {
-            const float a = src_row_ptr[xb];
-            const float b = hog_descriptor[xb + offset_y];
-
-            score += a * b;
-        }
-    }
-
-    score += dot(score_f32, (float4)1.0f); // Horizontal add of the 4 vector lanes into the scalar score
-
-    // Add the bias. The bias is located at the position (descriptor_size() - 1)
-    // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
-    score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
-
-    if(score > (float)THRESHOLD)
-    {
-        int id = atomic_inc(num_detection_windows); // atomic_inc returns the previous value, used here as the slot index
-        if(id < MAX_NUM_DETECTION_WINDOWS) // Other work-items may have filled the array since the check at the top
-        {
-            dst[id].x         = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
-            dst[id].y         = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
-            dst[id].width     = DETECTION_WINDOW_WIDTH;
-            dst[id].height    = DETECTION_WINDOW_HEIGHT;
-            dst[id].idx_class = IDX_CLASS;
-            dst[id].score     = score;
-        }
-    }
-}
-#endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
-        * DETECTION_WINDOW_STRIDE_WIDTH && DETECTION_WINDOW_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */

diff --git a/src/core/CL/cl_kernels/integral_image.cl b/src/core/CL/cl_kernels/integral_image.cl
deleted file mode 100644
index dd2c798..0000000
--- a/src/core/CL/cl_kernels/integral_image.cl
+++ /dev/null

@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function computes the horizontal integral of the image.
- *
- * The loop runs from 0 up to src_step_x in increments of 16. Each iteration loads 16 source values,
- * widens them to 32 bit, turns them into a running (prefix) sum and stores the 16 results in the
- * destination. The last value of an iteration is carried into the next one through @p prev.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U32
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void integral_horizontal(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uint prev = 0; // Running sum carried over from the previous group of 16 values
-
-    for(uint j = 0; j < src_step_x; j += 16)
-    {
-        barrier(CLK_GLOBAL_MEM_FENCE);
-        uint16 res = convert_uint16(vload16(0, offset(&src, j, 0)));
-        res.s0 += prev;
-        res.s1 += res.s0;
-        res.s2 += res.s1;
-        res.s3 += res.s2;
-        res.s4 += res.s3;
-        res.s5 += res.s4;
-        res.s6 += res.s5;
-        res.s7 += res.s6;
-        res.s8 += res.s7;
-        res.s9 += res.s8;
-        res.sA += res.s9;
-        res.sB += res.sA;
-        res.sC += res.sB;
-        res.sD += res.sC;
-        res.sE += res.sD;
-        res.sF += res.sE;
-        prev = res.sF;
-        vstore16(res, 0, (__global uint *)offset(&dst, j, 0));
-    }
-}
-
-/** This function computes the vertical integral of the image.
- *
- * The image is updated in place: starting from row 1, every row is replaced by the sum of itself and
- * the (already accumulated) row above it. Each work-item handles 8 consecutive 32-bit values per row.
- *
- * @param[in,out] src_ptr                           Pointer to the source image. Supported data types: U32
- * @param[in]     src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]     src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]     height                            Image height.
- */
-__kernel void integral_vertical(
-    IMAGE_DECLARATION(src),
-    uint height)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-    uint8 prev = vload8(0, (__global uint *)offset(&src, 0, 0)); // Row 0 is left unchanged and seeds the running sum
-    for(uint j = 1; j < height; ++j)
-    {
-        barrier(CLK_GLOBAL_MEM_FENCE);
-        uint8 res = vload8(0, (__global uint *)offset(&src, 0, j));
-        res += prev;
-        vstore8(res, 0, (__global uint *)offset(&src, 0, j));
-        prev = res;
-    }
-}

diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
deleted file mode 100644
index 48197d6..0000000
--- a/src/core/CL/cl_kernels/magnitude_phase.cl
+++ /dev/null

@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Calculates L1 normalization between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return L1 normalization magnitude result. Supported data types: S16, S32
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l1(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
-{
-    return CONVERT_SAT(add_sat(abs(a), abs(b)), VEC_DATA_TYPE(DATA_TYPE, 16));
-}
-
-/** Calculates L2 normalization between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return L2 normalization magnitude result. Supported data types: S16, S32
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l2(int16 a, int16 b)
-{
-    return CONVERT_SAT((sqrt(convert_float16((convert_uint16(a * a) + convert_uint16(b * b)))) + 0.5f),
-                       VEC_DATA_TYPE(DATA_TYPE, 16));
-}
-
-/** Calculates unsigned phase between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return Unsigned phase mapped in the interval [0, 180]. Supported data types: U8
- */
-inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
-{
-    float16 angle_deg_f32 = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f;
-    angle_deg_f32         = select(angle_deg_f32, (float16)180.0f + angle_deg_f32, angle_deg_f32 < (float16)0.0f);
-    return convert_uchar16(angle_deg_f32);
-}
-
-/** Calculates signed phase between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return Signed phase mapped in the interval [0, 256). Supported data types: U8
- */
-inline uchar16 phase_signed(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
-{
-    float16 arct = atan2pi(convert_float16(b), convert_float16(a));
-    arct         = select(arct, arct + 2, arct < 0.0f);
-
-    return convert_uchar16(convert_int16(mad(arct, 128, 0.5f)) & (int16)0xFFu);
-}
-
-#if(1 == MAGNITUDE)
-#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y))
-#elif(2 == MAGNITUDE)
-#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y))
-#else /* MAGNITUDE */
-#define MAGNITUDE_OP(x, y)
-#endif /* MAGNITUDE */
-
-#if(1 == PHASE)
-#define PHASE_OP(x, y) phase_unsigned((x), (y))
-#elif(2 == PHASE)
-#define PHASE_OP(x, y) phase_signed((x), (y))
-#else /* PHASE */
-#define PHASE_OP(x, y)
-#endif /* PHASE */
-
-/** Calculate the magnitude and phase of given the gradients of an image.
- *
- * @note Magnitude calculation supported: L1 normalization(type = 1) and L2 normalization(type = 2).
- * @note Phase calculation supported: Unsigned(type = 1) [0,128] and Signed(type = 2) [0,256).
- *
- * @attention To enable phase calculation -DPHASE="phase_calculation_type_id" must be provided at compile time. eg -DPHASE=1
- * @attention To enable magnitude calculation -DMAGNITUDE="magnitude_calculation_type_id" must be provided at compile time. eg -DMAGNITUDE=1
- * @attention Datatype of the two inputs is passed at compile time using -DDATA_TYPE. e.g -DDATA_TYPE=short. Supported data_types are: short and int
- *
- * @param[in]  gx_ptr                                  Pointer to the first source image (gradient X). Supported data types: S16, S32
- * @param[in]  gx_stride_x                             Stride of the source image in X dimension (in bytes)
- * @param[in]  gx_step_x                               gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  gx_stride_y                             Stride of the source image in Y dimension (in bytes)
- * @param[in]  gx_step_y                               gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  gx_offset_first_element_in_bytes        The offset of the first element in the source image
- * @param[in]  gy_ptr                                  Pointer to the second source image (gradient Y) . Supported data types: S16, S32
- * @param[in]  gy_stride_x                             Stride of the destination image in X dimension (in bytes)
- * @param[in]  gy_step_x                               gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  gy_stride_y                             Stride of the destination image in Y dimension (in bytes)
- * @param[in]  gy_step_y                               gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  gy_offset_first_element_in_bytes        The offset of the first element in the destination image
- * @param[out] magnitude_ptr                           Pointer to the magnitude destination image. Supported data types: S16, S32
- * @param[in]  magnitude_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  magnitude_step_x                        magnitude_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  magnitude_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  magnitude_step_y                        magnitude_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  magnitude_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] phase_ptr                               Pointer to the phase destination image. Supported data types: U8
- * @param[in]  phase_stride_x                          Stride of the destination image in X dimension (in bytes)
- * @param[in]  phase_step_x                            phase_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  phase_stride_y                          Stride of the destination image in Y dimension (in bytes)
- * @param[in]  phase_step_y                            phase_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  phase_offset_first_element_in_bytes     The offset of the first element in the destination image
- * */
-__kernel void magnitude_phase(
-    IMAGE_DECLARATION(gx),
-    IMAGE_DECLARATION(gy)
-#ifdef MAGNITUDE
-    ,
-    IMAGE_DECLARATION(magnitude)
-#endif /* MAGNITUDE */
-#ifdef PHASE
-    ,
-    IMAGE_DECLARATION(phase)
-#endif /* PHASE */
-)
-{
-    // Get pixels pointer
-    Image gx = CONVERT_TO_IMAGE_STRUCT(gx);
-    Image gy = CONVERT_TO_IMAGE_STRUCT(gy);
-
-    // Load values
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    in_a = vload16(0, (__global DATA_TYPE *)gx.ptr);
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    in_b = vload16(0, (__global DATA_TYPE *)gy.ptr);
-
-    // Calculate and store the results
-#ifdef MAGNITUDE
-    Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude);
-    vstore16(MAGNITUDE_OP(in_a, in_b), 0, (__global DATA_TYPE *)magnitude.ptr);
-#endif /* MAGNITUDE */
-#ifdef PHASE
-    Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
-    vstore16(PHASE_OP(in_a, in_b), 0, phase.ptr);
-#endif /* PHASE */
-}

diff --git a/src/core/CL/cl_kernels/mean_stddev.cl b/src/core/CL/cl_kernels/mean_stddev.cl
deleted file mode 100644
index 4ddf931..0000000
--- a/src/core/CL/cl_kernels/mean_stddev.cl
+++ /dev/null

@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-
-/** This function calculates the sum and sum of squares of a given input image.
- *
- * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  height                            Height of the input image
- * @param[out] global_sum                        Global sum of all elements
- * @param[out] global_sum_sq                     Global sum of squares of all elements
- */
-__kernel void mean_stddev_accumulate(
-    IMAGE_DECLARATION(src),
-    uint     height,
-    __global ulong *global_sum
-#ifdef STDDEV
-    ,
-    __global ulong *global_sum_sq
-#endif /* STDDEV */
-)
-{
-    // Get pixels pointer
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-    uint8 tmp_sum = 0;
-#ifdef STDDEV
-    uint8 tmp_sum_sq = 0;
-#endif /* STDDEV */
-    // Calculate partial sum
-    for(int i = 0; i < height; i++)
-    {
-        // Load data
-        uint8 data = convert_uint8(vload8(0, offset(&src, 0, i)));
-
-        tmp_sum += data;
-#ifdef STDDEV
-        tmp_sum_sq += data * data;
-#endif /* STDDEV */
-    }
-    // Perform reduction
-    tmp_sum.s0123 += tmp_sum.s4567;
-    tmp_sum.s01 += tmp_sum.s23;
-    atom_add(global_sum, tmp_sum.s0 + tmp_sum.s1);
-
-#ifdef STDDEV
-    tmp_sum_sq.s0123 += tmp_sum_sq.s4567;
-    tmp_sum_sq.s01 += tmp_sum_sq.s23;
-    atom_add(global_sum_sq, tmp_sum_sq.s0 + tmp_sum_sq.s1);
-#endif /* STDDEV */
-}
-
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable

diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl
deleted file mode 100644
index 1045f22..0000000
--- a/src/core/CL/cl_kernels/minmaxloc.cl
+++ /dev/null

@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-#ifndef DATA_TYPE_MIN
-#define DATA_TYPE_MIN 0x0
-#endif /* DATA_TYPE_MIN */
-
-#ifndef DATA_TYPE_MAX
-#define DATA_TYPE_MAX 0xFF
-#endif /* DATA_TYPE_MAX */
-
-inline int FloatFlip(float val)
-{
-    union
-    {
-        int   int_val;
-        float flt_val;
-    } u_val;
-    u_val.flt_val = val;
-    return (u_val.int_val >= 0) ? u_val.int_val : u_val.int_val ^ 0x7FFFFFFF;
-}
-
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN);
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX);
-__constant int16 idx16 = (int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-/** This function identifies the min and maximum value of an input image.
- *
- * @note Input image data type must be passed as a preprocessor argument using -DDATA_TYPE.
- * Moreover, the minimum and maximum value of the given data type must be provided using -DDATA_TYPE_MIN and -DDATA_TYPE_MAX respectively.
- * @note In case image width is not a multiple of 16 then -DNON_MULTIPLE_OF_16 must be passed.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] min_max                           Pointer to buffer with minimum value in position 0 and maximum value in position 1
- * @param[in]  width                             Input image width
- */
-__kernel void minmax(
-    IMAGE_DECLARATION(src),
-    __global int *min_max,
-    int           width)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-    // Initialize local minimum and local maximum
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    local_min = type_max;
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    local_max = type_min;
-
-    // Calculate min/max of row
-    int i = 0;
-    for(; i + 16 <= width; i += 16)
-    {
-        VEC_DATA_TYPE(DATA_TYPE, 16)
-        data      = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
-        local_min = min(data, local_min);
-        local_max = max(data, local_max);
-    }
-
-#ifdef NON_MULTIPLE_OF_16
-    // Handle non multiple of 16
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
-#ifdef IS_DATA_TYPE_FLOAT
-    int16 valid_indices = (i + idx16) < width;
-#else  /* IS_DATA_TYPE_FLOAT */
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    valid_indices = CONVERT((i + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
-#endif /* IS_DATA_TYPE_FLOAT */
-    local_max = max(local_max, select(type_min, data, valid_indices));
-    local_min = min(local_min, select(type_max, data, valid_indices));
-#endif /* NON_MULTIPLE_OF_16 */
-
-    // Perform min/max reduction
-    local_min.s01234567 = min(local_min.s01234567, local_min.s89ABCDEF);
-    local_max.s01234567 = max(local_max.s01234567, local_max.s89ABCDEF);
-
-    local_min.s0123 = min(local_min.s0123, local_min.s4567);
-    local_max.s0123 = max(local_max.s0123, local_max.s4567);
-
-    local_min.s01 = min(local_min.s01, local_min.s23);
-    local_max.s01 = max(local_max.s01, local_max.s23);
-
-    local_min.s0 = min(local_min.s0, local_min.s1);
-    local_max.s0 = max(local_max.s0, local_max.s1);
-
-    // Update global min/max
-#ifdef IS_DATA_TYPE_FLOAT
-    atomic_min(&min_max[0], FloatFlip(local_min.s0));
-    atomic_max(&min_max[1], FloatFlip(local_max.s0));
-#else  /* IS_DATA_TYPE_FLOAT */
-    atomic_min(&min_max[0], local_min.s0);
-    atomic_max(&min_max[1], local_max.s0);
-#endif /* IS_DATA_TYPE_FLOAT */
-}
-
-/** This function counts the min and max occurrences in an image and tags their position.
- *
- * @note -DCOUNT_MIN_MAX should be specified if we want to count the occurrences of the minimum and maximum values.
- * @note -DLOCATE_MIN and/or -DLOCATE_MAX should be specified if we want to store the position of each occurrence on the given array.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  min_max                           Pointer to buffer with minimum value in position 0 and maximum value in position 1
- * @param[out] min_max_count                     Pointer to buffer with minimum value occurrences in position 0 and maximum value occurrences in position 1
- * @param[out] min_loc                           Array that holds the location of the minimum value occurrences
- * @param[in]  max_min_loc_count                 The maximum number of min value occurrences coordinates the array can hold
- * @param[out] max_loc                           Array that holds the location of the maximum value occurrences
- * @param[in]  max_max_loc_count                 The maximum number of max value occurrences coordinates the array can hold
- */
-__kernel void minmaxloc(
-    IMAGE_DECLARATION(src),
-    __global int *min_max,
-    __global uint *min_max_count
-#ifdef LOCATE_MIN
-    ,
-    __global Coordinates2D *min_loc, uint max_min_loc_count
-#endif /* LOCATE_MIN */
-#ifdef LOCATE_MAX
-    ,
-    __global Coordinates2D *max_loc, uint max_max_loc_count
-#endif /* LOCATE_MAX */
-)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-#ifdef IS_DATA_TYPE_FLOAT
-    __global float *min_max_ptr = (__global float *)min_max;
-    float           min_value   = min_max_ptr[0];
-    float           max_value   = min_max_ptr[1];
-#else  /* IS_DATA_TYPE_FLOAT */
-    int min_value = min_max[0];
-    int max_value = min_max[1];
-#endif /* IS_DATA_TYPE_FLOAT */
-
-    DATA_TYPE value = *((__global DATA_TYPE *)src.ptr);
-#ifdef COUNT_MIN_MAX
-    if(value == min_value)
-    {
-        uint idx = atomic_inc(&min_max_count[0]);
-#ifdef LOCATE_MIN
-        if(idx < max_min_loc_count)
-        {
-            min_loc[idx].x = get_global_id(0);
-            min_loc[idx].y = get_global_id(1);
-        }
-#endif /* LOCATE_MIN */
-    }
-    if(value == max_value)
-    {
-        uint idx = atomic_inc(&min_max_count[1]);
-#ifdef LOCATE_MAX
-        if(idx < max_max_loc_count)
-        {
-            max_loc[idx].x = get_global_id(0);
-            max_loc[idx].y = get_global_id(1);
-        }
-#endif /* LOCATE_MAX */
-    }
-#endif /* COUNT_MIN_MAX */
-}

diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
deleted file mode 100644
index 93c5024..0000000
--- a/src/core/CL/cl_kernels/non_linear_filter3x3.cl
+++ /dev/null

@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "non_linear_filter_helpers.h"
-
-/** This function applies a non linear filter on a 3x3 box basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_box3x3(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar16 top    = vload16(0, offset(&src, -1, -1));
-    uchar16 middle = vload16(0, offset(&src, -1, 0));
-    uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
-    // Apply respective filter
-#ifdef MIN
-    uchar16 tmp = min(top, min(middle, bottom));
-    uchar8  out = row_reduce_min_3(tmp);
-#elif defined(MAX)
-    uchar16 tmp = max(top, max(middle, bottom));
-    uchar8  out = row_reduce_max_3(tmp);
-#elif defined(MEDIAN)
-    uchar8 p0  = top.s01234567;
-    uchar8 p1  = top.s12345678;
-    uchar8 p2  = top.s23456789;
-    uchar8 p3  = middle.s01234567;
-    uchar8 p4  = middle.s12345678;
-    uchar8 p5  = middle.s23456789;
-    uchar8 p6  = bottom.s01234567;
-    uchar8 p7  = bottom.s12345678;
-    uchar8 p8  = bottom.s23456789;
-    uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non linear filter on a 3x3 cross basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_cross3x3(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar8  top    = vload8(0, offset(&src, 0, -1));
-    uchar16 middle = vload16(0, offset(&src, -1, 0));
-    uchar8  bottom = vload8(0, offset(&src, 0, 1));
-
-    // Apply respective filter
-#ifdef MIN
-    uchar8 tmp_middle = row_reduce_min_3(middle);
-    uchar8 out        = min(tmp_middle, min(top, bottom));
-#elif defined(MAX)
-    uchar8  tmp_middle = row_reduce_max_3(middle);
-    uchar8  out        = max(tmp_middle, max(top, bottom));
-#elif defined(MEDIAN)
-    uchar8 p0  = top.s01234567;
-    uchar8 p1  = middle.s01234567;
-    uchar8 p2  = middle.s12345678;
-    uchar8 p3  = middle.s23456789;
-    uchar8 p4  = bottom.s01234567;
-    uchar8 out = sort5(p0, p1, p2, p3, p4);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non linear filter on a 3x3 disk basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_disk3x3(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar16 top    = vload16(0, offset(&src, -1, -1));
-    uchar16 middle = vload16(0, offset(&src, -1, 0));
-    uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
-    // Apply respective filter
-#ifdef MIN
-    uchar16 tmp = min(top, min(middle, bottom));
-    uchar8  out = row_reduce_min_3(tmp);
-#elif defined(MAX)
-    uchar16 tmp        = max(top, max(middle, bottom));
-    uchar8  out        = row_reduce_max_3(tmp);
-#elif defined(MEDIAN)
-    uchar8 p0  = top.s01234567;
-    uchar8 p1  = top.s12345678;
-    uchar8 p2  = top.s23456789;
-    uchar8 p3  = middle.s01234567;
-    uchar8 p4  = middle.s12345678;
-    uchar8 p5  = middle.s23456789;
-    uchar8 p6  = bottom.s01234567;
-    uchar8 p7  = bottom.s12345678;
-    uchar8 p8  = bottom.s23456789;
-    uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}

diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
deleted file mode 100644
index 7c87284..0000000
--- a/src/core/CL/cl_kernels/non_linear_filter5x5.cl
+++ /dev/null

@@ -1,483 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "non_linear_filter_helpers.h"
-
-// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
-
-/** Sorting network to sort 8 disks of diameter 5 and return their median.
- *
- * @param[in] top2    Values of elements two rows above.
- * @param[in] top     Values of elements one row above.
- * @param[in] middle  Values of middle elements.
- * @param[in] bottom  Values of elements one row below.
- * @param[in] bottom2 Values of elements two rows below.
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 median_disk5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
-{
-    uchar8 p0  = top2.s01234567;
-    uchar8 p1  = top2.s12345678;
-    uchar8 p2  = top2.s23456789;
-    uchar8 p3  = top.s01234567;
-    uchar8 p4  = top.s12345678;
-    uchar8 p5  = top.s23456789;
-    uchar8 p6  = top.s3456789A;
-    uchar8 p7  = top.s456789AB;
-    uchar8 p8  = middle.s01234567;
-    uchar8 p9  = middle.s12345678;
-    uchar8 p10 = middle.s23456789;
-    uchar8 p11 = middle.s3456789A;
-    uchar8 p12 = middle.s456789AB;
-    uchar8 p13 = bottom.s01234567;
-    uchar8 p14 = bottom.s12345678;
-    uchar8 p15 = bottom.s23456789;
-    uchar8 p16 = bottom.s3456789A;
-    uchar8 p17 = bottom.s456789AB;
-    uchar8 p18 = bottom2.s01234567;
-    uchar8 p19 = bottom2.s12345678;
-    uchar8 p20 = bottom2.s23456789;
-
-    SORT(p0, p1);
-    SORT(p2, p3);
-    SORT(p4, p5);
-    SORT(p6, p7);
-    SORT(p8, p9);
-    SORT(p10, p11);
-    SORT(p12, p13);
-    SORT(p14, p15);
-    SORT(p16, p17);
-    SORT(p18, p19);
-    SORT(p0, p2);
-    SORT(p1, p3);
-    SORT(p4, p6);
-    SORT(p5, p7);
-    SORT(p8, p10);
-    SORT(p9, p11);
-    SORT(p12, p14);
-    SORT(p13, p15);
-    SORT(p16, p18);
-    SORT(p17, p19);
-    SORT(p1, p2);
-    SORT(p5, p6);
-    SORT(p0, p4);
-    SORT(p3, p7);
-    SORT(p9, p10);
-    SORT(p13, p14);
-    SORT(p8, p12);
-    SORT(p11, p15);
-    SORT(p17, p18);
-    SORT(p16, p20);
-    SORT(p1, p5);
-    SORT(p2, p6);
-    SORT(p9, p13);
-    SORT(p10, p14);
-    SORT(p0, p8);
-    SORT(p7, p15);
-    SORT(p17, p20);
-    SORT(p1, p4);
-    SORT(p3, p6);
-    SORT(p9, p12);
-    SORT(p11, p14);
-    SORT(p18, p20);
-    SORT(p0, p16);
-    SORT(p2, p4);
-    SORT(p3, p5);
-    SORT(p10, p12);
-    SORT(p11, p13);
-    SORT(p1, p9);
-    SORT(p6, p14);
-    SORT(p19, p20);
-    SORT(p3, p4);
-    SORT(p11, p12);
-    SORT(p1, p8);
-    SORT(p2, p10);
-    SORT(p5, p13);
-    SORT(p7, p14);
-    SORT(p3, p11);
-    SORT(p2, p8);
-    SORT(p4, p12);
-    SORT(p7, p13);
-    SORT(p1, p17);
-    SORT(p3, p10);
-    SORT(p5, p12);
-    SORT(p1, p16);
-    SORT(p2, p18);
-    SORT(p3, p9);
-    SORT(p6, p12);
-    SORT(p2, p16);
-    SORT(p3, p8);
-    SORT(p7, p12);
-    SORT(p5, p9);
-    SORT(p6, p10);
-    SORT(p4, p8);
-    SORT(p7, p11);
-    SORT(p3, p19);
-    SORT(p5, p8);
-    SORT(p7, p10);
-    SORT(p3, p18);
-    SORT(p4, p20);
-    SORT(p6, p8);
-    SORT(p7, p9);
-    SORT(p3, p17);
-    SORT(p5, p20);
-    SORT(p7, p8);
-    SORT(p3, p16);
-    SORT(p6, p20);
-    SORT(p5, p17);
-    SORT(p7, p20);
-    SORT(p4, p16);
-    SORT(p6, p18);
-    SORT(p5, p16);
-    SORT(p7, p19);
-    SORT(p7, p18);
-    SORT(p6, p16);
-    SORT(p7, p17);
-    SORT(p10, p18);
-    SORT(p7, p16);
-    SORT(p9, p17);
-    SORT(p8, p16);
-    SORT(p9, p16);
-    SORT(p10, p16);
-
-    return p10;
-}
-
-/** Sorting network to sort 8 boxes of size 5 and return their median.
- *
- * @param[in] top2    Values of elements two rows above.
- * @param[in] top     Values of elements one row above.
- * @param[in] middle  Values of middle elements.
- * @param[in] bottom  Values of elements one row below.
- * @param[in] bottom2 Values of elements two rows below.
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 median_box5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
-{
-    uchar8 p0  = top2.s01234567;
-    uchar8 p1  = top2.s12345678;
-    uchar8 p2  = top2.s23456789;
-    uchar8 p3  = top2.s3456789A;
-    uchar8 p4  = top2.s456789AB;
-    uchar8 p5  = top.s01234567;
-    uchar8 p6  = top.s12345678;
-    uchar8 p7  = top.s23456789;
-    uchar8 p8  = top.s3456789A;
-    uchar8 p9  = top.s456789AB;
-    uchar8 p10 = middle.s01234567;
-    uchar8 p11 = middle.s12345678;
-    uchar8 p12 = middle.s23456789;
-    uchar8 p13 = middle.s3456789A;
-    uchar8 p14 = middle.s456789AB;
-    uchar8 p15 = bottom.s01234567;
-    uchar8 p16 = bottom.s12345678;
-    uchar8 p17 = bottom.s23456789;
-    uchar8 p18 = bottom.s3456789A;
-    uchar8 p19 = bottom.s456789AB;
-    uchar8 p20 = bottom2.s01234567;
-    uchar8 p21 = bottom2.s12345678;
-    uchar8 p22 = bottom2.s23456789;
-    uchar8 p23 = bottom2.s3456789A;
-    uchar8 p24 = bottom2.s456789AB;
-
-    SORT(p1, p2);
-    SORT(p0, p1);
-    SORT(p1, p2);
-    SORT(p4, p5);
-    SORT(p3, p4);
-    SORT(p4, p5);
-    SORT(p0, p3);
-    SORT(p2, p5);
-    SORT(p2, p3);
-    SORT(p1, p4);
-    SORT(p1, p2);
-    SORT(p3, p4);
-    SORT(p7, p8);
-    SORT(p6, p7);
-    SORT(p7, p8);
-    SORT(p10, p11);
-    SORT(p9, p10);
-    SORT(p10, p11);
-    SORT(p6, p9);
-    SORT(p8, p11);
-    SORT(p8, p9);
-    SORT(p7, p10);
-    SORT(p7, p8);
-    SORT(p9, p10);
-    SORT(p0, p6);
-    SORT(p4, p10);
-    SORT(p4, p6);
-    SORT(p2, p8);
-    SORT(p2, p4);
-    SORT(p6, p8);
-    SORT(p1, p7);
-    SORT(p5, p11);
-    SORT(p5, p7);
-    SORT(p3, p9);
-    SORT(p3, p5);
-    SORT(p7, p9);
-    SORT(p1, p2);
-    SORT(p3, p4);
-    SORT(p5, p6);
-    SORT(p7, p8);
-    SORT(p9, p10);
-    SORT(p13, p14);
-    SORT(p12, p13);
-    SORT(p13, p14);
-    SORT(p16, p17);
-    SORT(p15, p16);
-    SORT(p16, p17);
-    SORT(p12, p15);
-    SORT(p14, p17);
-    SORT(p14, p15);
-    SORT(p13, p16);
-    SORT(p13, p14);
-    SORT(p15, p16);
-    SORT(p19, p20);
-    SORT(p18, p19);
-    SORT(p19, p20);
-    SORT(p21, p22);
-    SORT(p23, p24);
-    SORT(p21, p23);
-    SORT(p22, p24);
-    SORT(p22, p23);
-    SORT(p18, p21);
-    SORT(p20, p23);
-    SORT(p20, p21);
-    SORT(p19, p22);
-    SORT(p22, p24);
-    SORT(p19, p20);
-    SORT(p21, p22);
-    SORT(p23, p24);
-    SORT(p12, p18);
-    SORT(p16, p22);
-    SORT(p16, p18);
-    SORT(p14, p20);
-    SORT(p20, p24);
-    SORT(p14, p16);
-    SORT(p18, p20);
-    SORT(p22, p24);
-    SORT(p13, p19);
-    SORT(p17, p23);
-    SORT(p17, p19);
-    SORT(p15, p21);
-    SORT(p15, p17);
-    SORT(p19, p21);
-    SORT(p13, p14);
-    SORT(p15, p16);
-    SORT(p17, p18);
-    SORT(p19, p20);
-    SORT(p21, p22);
-    SORT(p23, p24);
-    SORT(p0, p12);
-    SORT(p8, p20);
-    SORT(p8, p12);
-    SORT(p4, p16);
-    SORT(p16, p24);
-    SORT(p12, p16);
-    SORT(p2, p14);
-    SORT(p10, p22);
-    SORT(p10, p14);
-    SORT(p6, p18);
-    SORT(p6, p10);
-    SORT(p10, p12);
-    SORT(p1, p13);
-    SORT(p9, p21);
-    SORT(p9, p13);
-    SORT(p5, p17);
-    SORT(p13, p17);
-    SORT(p3, p15);
-    SORT(p11, p23);
-    SORT(p11, p15);
-    SORT(p7, p19);
-    SORT(p7, p11);
-    SORT(p11, p13);
-    SORT(p11, p12);
-    return p12;
-}
-
-/** This function applies a non linear filter on a 5x5 box basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_box5x5(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar16 top2    = vload16(0, offset(&src, -2, -2));
-    uchar16 top     = vload16(0, offset(&src, -2, -1));
-    uchar16 middle  = vload16(0, offset(&src, -2, 0));
-    uchar16 bottom  = vload16(0, offset(&src, -2, 1));
-    uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
-
-    // Apply respective filter
-#ifdef MIN
-    uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2)));
-    uchar8  out = row_reduce_min_5(tmp);
-#elif defined(MAX)
-    uchar16 tmp = max(middle, max(max(top2, top), max(bottom, bottom2)));
-    uchar8  out = row_reduce_max_5(tmp);
-#elif defined(MEDIAN)
-    uchar8 out = median_box5x5(top2, top, middle, bottom, bottom2);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non linear filter on a 5x5 cross basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_cross5x5(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar8  top2    = vload8(0, offset(&src, 0, -2));
-    uchar8  top     = vload8(0, offset(&src, 0, -1));
-    uchar16 middle  = vload16(0, offset(&src, -2, 0));
-    uchar8  bottom  = vload8(0, offset(&src, 0, 1));
-    uchar8  bottom2 = vload8(0, offset(&src, 0, 2));
-
-    // Apply respective filter
-#ifdef MIN
-    uchar8 tmp_middle = row_reduce_min_5(middle);
-    uchar8 out        = min(tmp_middle, min(min(top2, top), min(bottom, bottom2)));
-#elif defined(MAX)
-    uchar8  tmp_middle = row_reduce_max_5(middle);
-    uchar8  out        = max(tmp_middle, max(max(top2, top.s01234567), max(bottom, bottom2)));
-#elif defined(MEDIAN)
-    uchar8 p0  = top2;
-    uchar8 p1  = top;
-    uchar8 p2  = middle.s01234567;
-    uchar8 p3  = middle.s12345678;
-    uchar8 p4  = middle.s23456789;
-    uchar8 p5  = middle.s3456789A;
-    uchar8 p6  = middle.s456789AB;
-    uchar8 p7  = bottom;
-    uchar8 p8  = bottom2;
-    uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non linear filter on a 5x5 disk basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_disk5x5(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar16 top2    = vload16(0, offset(&src, -2, -2));
-    uchar16 top     = vload16(0, offset(&src, -2, -1));
-    uchar16 middle  = vload16(0, offset(&src, -2, 0));
-    uchar16 bottom  = vload16(0, offset(&src, -2, 1));
-    uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
-
-    // Shift top2 and bottom2 values
-    top2    = top2.s123456789ABCDEFF;
-    bottom2 = bottom2.s123456789ABCDEFF;
-
-    // Apply respective filter
-#ifdef MIN
-    uchar16 tmp_3     = min(top2, bottom2);
-    uchar16 tmp_5     = min(middle, min(top, bottom));
-    uchar8  tmp_3_red = row_reduce_min_3(tmp_3);
-    uchar8  tmp_5_red = row_reduce_min_5(tmp_5);
-    uchar8  out       = min(tmp_3_red, tmp_5_red);
-#elif defined(MAX)
-    uchar16 tmp_3      = max(top2, bottom2);
-    uchar16 tmp_5      = max(middle, max(top, bottom));
-    uchar8  tmp_3_red  = row_reduce_max_3(tmp_3);
-    uchar8  tmp_5_red  = row_reduce_max_5(tmp_5);
-    uchar8  out        = max(tmp_3_red, tmp_5_red);
-#elif defined(MEDIAN)
-    uchar8 out = median_disk5x5(top2, top, middle, bottom, bottom2);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}

diff --git a/src/core/CL/cl_kernels/non_linear_filter_helpers.h b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
deleted file mode 100644
index 3fcfad4..0000000
--- a/src/core/CL/cl_kernels/non_linear_filter_helpers.h
+++ /dev/null

@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/** Sorts element-wise two vectors.
- *
- * @param[in, out] a First vector
- * @param[in, out] b Second vector
- */
-#define SORT(a, b)                  \
-    {                               \
-        uchar8 min_val = min(a, b); \
-        uchar8 max_val = max(a, b); \
-        a              = min_val;   \
-        b              = max_val;   \
-    }
-
-// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
-
-/** Sorting network to sort 5 vectors of 8 elements and return their median.
- *
- * @param[in] p0 First element vector
- * @param[in] p1 Second element vector
- * @param[in] p2 Third element vector
- * @param[in] p3 Fourth element vector
- * @param[in] p4 Fifth element vector
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
-{
-    SORT(p0, p1);
-    SORT(p2, p3);
-    SORT(p0, p2);
-    SORT(p1, p3);
-    SORT(p1, p2);
-    SORT(p0, p4);
-    SORT(p1, p4);
-    SORT(p2, p4);
-
-    return p2;
-}
-
-/** Sorting network to sort 9 vectors of 8 elements and return their median.
- *
- * @param[in] p0 First element vector
- * @param[in] p1 Second element vector
- * @param[in] p2 Third element vector
- * @param[in] p3 Fourth element vector
- * @param[in] p4 Fifth element vector
- * @param[in] p5 Sixth element vector
- * @param[in] p6 Seventh element vector
- * @param[in] p7 Eigth element vector
- * @param[in] p8 Ninth element vector
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
-{
-    SORT(p1, p2);
-    SORT(p4, p5);
-    SORT(p7, p8);
-    SORT(p0, p1);
-    SORT(p3, p4);
-    SORT(p6, p7);
-    SORT(p1, p2);
-    SORT(p4, p5);
-    SORT(p7, p8);
-    SORT(p0, p3);
-    SORT(p5, p8);
-    SORT(p4, p7);
-    SORT(p3, p6);
-    SORT(p1, p4);
-    SORT(p2, p5);
-    SORT(p4, p7);
-    SORT(p4, p2);
-    SORT(p6, p4);
-    SORT(p4, p2);
-
-    return p4;
-}
-
-/** Calculate the minimum of a sliding window of size 3.
- *
- * @param val Values to calculate the minimum values
- *
- * @return Minimum values of 8 elements on a sliding window of size 3.
- */
-inline uchar8 row_reduce_min_3(uchar16 val)
-{
-    return min(val.s01234567, min(val.s12345678, val.s23456789));
-}
-
-/** Calculate the maximum of a sliding window of size 3.
- *
- * @param val Values to calculate the maximum values
- *
- * @return Maximum values of 8 elements on a sliding window of size 3.
- */
-inline uchar8 row_reduce_max_3(uchar16 val)
-{
-    return max(val.s01234567, max(val.s12345678, val.s23456789));
-}
-
-/** Calculate the minimum of a sliding window of size 5.
- *
- * @param val Values to calculate the minimum values
- *
- * @return Minimum values of 8 elements on a sliding window of size 5.
- */
-inline uchar8 row_reduce_min_5(uchar16 val)
-{
-    return min(val.s01234567, min(min(val.s12345678, val.s23456789), min(val.s3456789A, val.s456789AB)));
-}
-
-/** Calculate the maximum of a sliding window of size 5.
- *
- * @param val Values to calculate the maximum values
- *
- * @return Maximum values of 8 elements on a sliding window of size 5.
- */
-inline uchar8 row_reduce_max_5(uchar16 val)
-{
-    return max(val.s01234567, max(max(val.s12345678, val.s23456789), max(val.s3456789A, val.s456789AB)));
-}

diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
deleted file mode 100644
index 9bbde1a..0000000
--- a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
+++ /dev/null

@@ -1,521 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-/*
- *The criteria for lost tracking is that the spatial gradient matrix has:
- * - Determinant less than DETERMINANT_THR
- * - or minimum eigenvalue is smaller then EIGENVALUE_THR
- *
- * The thresholds for the determinant and the minimum eigenvalue is
- * defined by the OpenVX spec
- *
- * Note: Also lost tracking happens when the point tracked coordinate is outside
- * the image coordinates
- *
- * https://www.khronos.org/registry/vx/specs/1.0/html/d0/d0c/group__group__vision__function__opticalflowpyrlk.html
- */
-
-/* Internal Lucas-Kanade Keypoint struct */
-typedef struct InternalKeypoint
-{
-    float x;               /**< The x coordinate. */
-    float y;               /**< The y coordinate. */
-    float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
-    float dummy;           /**< Dummy member for alignment. */
-} InternalKeypoint;
-
-/** Threshold for the determinant. Used for lost tracking criteria */
-#define DETERMINANT_THR 1.0e-07f
-
-/** Thresholds for minimum eigenvalue. Used for lost tracking criteria */
-#define EIGENVALUE_THR 1.0e-04f
-
-/** Constants used for Lucas-Kanade Algorithm */
-#define W_BITS (14)
-#define FLT_SCALE (1.0f / (float)(1 << 20))
-#define D0 ((float)(1 << W_BITS))
-#define D1 (1.0f / (float)(1 << (W_BITS - 5)))
-
-/** Initializes the internal new points array when the level of pyramid is NOT equal to max.
- *
- * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
- * @param[in,out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
- * @param[in]     scale               Scale factor to apply for the new_point coordinates.
- */
-__kernel void init_level(
-    __global float4 *old_points_internal,
-    __global float4 *new_points_internal,
-    const float      scale)
-{
-    int idx = get_global_id(0);
-
-    // Get old and new keypoints
-    float4 old_point = old_points_internal[idx];
-    float4 new_point = new_points_internal[idx];
-
-    // Scale accordingly with the pyramid_scale
-    old_point.xy *= (float2)(2.0f);
-    new_point.xy *= (float2)(2.0f);
-
-    old_points_internal[idx] = old_point;
-    new_points_internal[idx] = new_point;
-}
-
-/** Initializes the internal new points array when the level of pyramid is equal to max.
- *
- * @param[in]     old_points          An array of key points that are defined at the old_images high resolution pyramid.
- * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
- * @param[out]    new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
- * @param[in]     scale               Scale factor to apply for the new_point coordinates.
- */
-__kernel void init_level_max(
-    __global Keypoint *old_points,
-    __global InternalKeypoint *old_points_internal,
-    __global InternalKeypoint *new_points_internal,
-    const float                scale)
-{
-    int idx = get_global_id(0);
-
-    Keypoint old_point = old_points[idx];
-
-    // Get old keypoint to track
-    InternalKeypoint old_point_internal;
-    old_point_internal.x               = old_point.x * scale;
-    old_point_internal.y               = old_point.y * scale;
-    old_point_internal.tracking_status = 1.f;
-
-    // Store internal keypoints
-    old_points_internal[idx] = old_point_internal;
-    new_points_internal[idx] = old_point_internal;
-}
-
-/** Initializes the new_points array when the level of pyramid is equal to max and if use_initial_estimate = 1.
- *
- * @param[in]     old_points           An array of key points that are defined at the old_images high resolution pyramid.
- * @param[in]     new_points_estimates An array of estimate key points that are defined at the old_images high resolution pyramid.
- * @param[in,out] old_points_internal  An array of internal key points that are defined at the old_images high resolution pyramid.
- * @param[out]    new_points_internal  An array of internal key points that are defined at the new_images high resolution pyramid.
- * @param[in]     scale                Scale factor to apply for the new_point coordinates.
- */
-__kernel void init_level_max_initial_estimate(
-    __global Keypoint *old_points,
-    __global Keypoint *new_points_estimates,
-    __global InternalKeypoint *old_points_internal,
-    __global InternalKeypoint *new_points_internal,
-    const float                scale)
-{
-    int idx = get_global_id(0);
-
-    Keypoint         old_point          = old_points[idx];
-    Keypoint         new_point_estimate = new_points_estimates[idx];
-    InternalKeypoint old_point_internal;
-    InternalKeypoint new_point_internal;
-
-    // Get old keypoint to track
-    old_point_internal.x               = old_point.x * scale;
-    old_point_internal.y               = old_point.y * scale;
-    old_point_internal.tracking_status = 1.f;
-
-    // Get new keypoint to track
-    new_point_internal.x               = new_point_estimate.x * scale;
-    new_point_internal.y               = new_point_estimate.y * scale;
-    new_point_internal.tracking_status = new_point_estimate.tracking_status;
-
-    // Store internal keypoints
-    old_points_internal[idx] = old_point_internal;
-    new_points_internal[idx] = new_point_internal;
-}
-
-/** Truncates the coordinates stored in new_points array
- *
- * @param[in]  new_points_internal An array of estimate key points that are defined at the new_images high resolution pyramid.
- * @param[out] new_points          An array of internal key points that are defined at the new_images high resolution pyramid.
- */
-__kernel void finalize(
-    __global InternalKeypoint *new_points_internal,
-    __global Keypoint *new_points)
-{
-    int idx = get_global_id(0);
-
-    // Load internal keypoint
-    InternalKeypoint new_point_internal = new_points_internal[idx];
-
-    // Calculate output point
-    Keypoint new_point;
-    new_point.x               = round(new_point_internal.x);
-    new_point.y               = round(new_point_internal.y);
-    new_point.strength        = 0.f;
-    new_point.scale           = 0.f;
-    new_point.orientation     = 0.f;
-    new_point.tracking_status = new_point_internal.tracking_status;
-    new_point.error           = 0.f;
-
-    // Store new point
-    new_points[idx] = new_point;
-}
-
-/** Computes A11, A12, A22, min_eig, ival, ixval and iyval at level 0th of the pyramid. These values will be used in step 1.
- *
- * @param[in]      old_image_ptr                               Pointer to the input old image. Supported data types: U8
- * @param[in]      old_image_stride_x                          Stride of the input old image in X dimension (in bytes)
- * @param[in]      old_image_step_x                            old_image_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      old_image_stride_y                          Stride of the input old image in Y dimension (in bytes)
- * @param[in]      old_image_step_y                            old_image_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      old_image_offset_first_element_in_bytes     The offset of the first element in the input old image
- * @param[in]      old_scharr_gx_ptr                           Pointer to the input scharr x image. Supported data types: S16
- * @param[in]      old_scharr_gx_stride_x                      Stride of the input scharr x image in X dimension (in bytes)
- * @param[in]      old_scharr_gx_step_x                        old_scharr_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      old_scharr_gx_stride_y                      Stride of the input scharr x image in Y dimension (in bytes)
- * @param[in]      old_scharr_gx_step_y                        old_scharr_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      old_scharr_gx_offset_first_element_in_bytes The offset of the first element in the input scharr x image
- * @param[in]      old_scharr_gy_ptr                           Pointer to the input scharr y image. Supported data types: S16
- * @param[in]      old_scharr_gy_stride_x                      Stride of the input scharr y image in X dimension (in bytes)
- * @param[in]      old_scharr_gy_step_x                        old_scharr_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      old_scharr_gy_stride_y                      Stride of the input scharr y image in Y dimension (in bytes)
- * @param[in]      old_scharr_gy_step_y                        old_scharr_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      old_scharr_gy_offset_first_element_in_bytes The offset of the first element in the input scharr y image
- * @param[in]      old_points                                  An array of key points. Those key points are defined at the old_images high resolution pyramid
- * @param[in, out] new_points                                  An output array of key points. Those key points are defined at the new_images high resolution pyramid
- * @param[out]     coeff                                       It stores | A11 | A12 | A22 | min_eig | for each keypoint
- * @param[out]     iold_val                                    It stores | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
- * @param[in]      window_dimension                            The size of the window on which to perform the algorithm
- * @param[in]      window_dimension_pow2                       The squared size of the window on which to perform the algorithm
- * @param[in]      half_window                                 The half size of the window on which to perform the algorithm
- * @param[in]      border_limits                               xy: exclusive upper limits for the window origin (width - window_dimension - 1, height - window_dimension - 1); z: inclusive lower limit
- * @param[in]      eig_const                                   1.0f / (float)(2.0f * window_dimension * window_dimension)
- * @param[in]      level0                                      It is set to 1 if level 0 of the pyramid
- */
-void __kernel lktracker_stage0(
-    IMAGE_DECLARATION(old_image),
-    IMAGE_DECLARATION(old_scharr_gx),
-    IMAGE_DECLARATION(old_scharr_gy),
-    __global float4 *old_points,
-    __global float4 *new_points,
-    __global float4 *coeff,
-    __global short4 *iold_val,
-    const int        window_dimension,
-    const int        window_dimension_pow2,
-    const int        half_window,
-    const float3     border_limits,
-    const float      eig_const,
-    const int        level0)
-{
-    int idx = get_global_id(0);
-
-    Image old_image     = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_image);
-    Image old_scharr_gx = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gx);
-    Image old_scharr_gy = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gy);
-
-    // Origin of the tracking window: the old keypoint shifted back by half_window
-    float2 old_keypoint = old_points[idx].xy - (float2)half_window;
-
-    // Get the floor value
-    float2 iold_keypoint = floor(old_keypoint);
-
-    // Reject the point if the window origin is below border_limits.z or not below border_limits.xy (the loops below would read outside the image)
-    if(any(iold_keypoint < border_limits.zz) || any(iold_keypoint >= border_limits.xy))
-    {
-        if(level0 == 1)
-        {
-            // Invalidate tracked point as we are at level 0
-            new_points[idx].s2 = 0.0f;
-        }
-
-        // Not valid coordinate. It sets min_eig to 0.0f
-        coeff[idx].s3 = 0.0f;
-
-        return;
-    }
-
-    // Fractional part of the window origin, used as the bilinear interpolation weights
-    float2 ab = old_keypoint - iold_keypoint;
-
-    // Weight used for Bilinear-Interpolation on Scharr images
-    // w_scharr.s0 = (1.0f - ab.x) * (1.0f - ab.y)
-    // w_scharr.s1 = ab.x * (1.0f - ab.y)
-    // w_scharr.s2 = (1.0f - ab.x) * ab.y
-    // w_scharr.s3 = ab.x * ab.y
-
-    float4 w_scharr;
-    w_scharr.s3  = ab.x * ab.y;
-    w_scharr.s0  = w_scharr.s3 + 1.0f - ab.x - ab.y;
-    w_scharr.s12 = ab - (float2)w_scharr.s3;
-
-    // Weight used for Bilinear-Interpolation on Old and New images
-    // w.s0 = round(w_scharr.s0 * D0)
-    // w.s1 = round(w_scharr.s1 * D0)
-    // w.s2 = round(w_scharr.s2 * D0)
-    // w.s3 = D0 - w.s0 - w.s1 - w.s2
-
-    float4 w;
-    w    = round(w_scharr * (float4)D0);
-    w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation
-
-    // iG.s0 = A11, iG.s1 = A12, iG.s2 = A22 (integer accumulators); iG.s3 is left at 0
-    int4 iG = (int4)0;
-
-    // Start of this keypoint's slice of iold_val (window_dimension_pow2 entries per keypoint)
-    int window_offset = idx * window_dimension_pow2;
-
-    // Compute Spatial Gradient Matrix G
-    for(ushort ky = 0; ky < window_dimension; ++ky)
-    {
-        int offset_y = iold_keypoint.y + ky;
-        for(ushort kx = 0; kx < window_dimension; ++kx)
-        {
-            int    offset_x = iold_keypoint.x + kx;
-            float4 px;
-
-            // Load the 2x2 neighbourhood (two pixels on this row, two on the next) from old_image
-            px = convert_float4((uchar4)(vload2(0, offset(&old_image, offset_x, offset_y)),
-                                         vload2(0, offset(&old_image, offset_x, offset_y + 1))));
-
-            // old_i.s0 = ival, old_i.s1 = ixval, old_i.s2 = iyval, old_i.s3 = dummy
-            float4 old_i;
-
-            // Compute bilinear interpolation (with D1 scale factor) for ival
-            old_i.s0 = dot(px, w) * D1;
-
-            // Load the same 2x2 neighbourhood from old_scharr_gx (read as 16-bit signed values)
-            px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y)),
-                                         vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y + 1))));
-
-            // Compute bilinear interpolation for ixval
-            old_i.s1 = dot(px, w_scharr);
-
-            // Load the same 2x2 neighbourhood from old_scharr_gy (read as 16-bit signed values)
-            px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y)),
-                                         vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y + 1))));
-
-            // Compute bilinear interpolation for iyval
-            old_i.s2 = dot(px, w_scharr);
-
-            // Rounding (it could be omitted. Used just for matching the VX implementation)
-            int4 iold = convert_int4(round(old_i));
-
-            // Accumulate values in the Spatial Gradient Matrix
-            iG.s0 += (int)(iold.s1 * iold.s1);
-            iG.s1 += (int)(iold.s1 * iold.s2);
-            iG.s2 += (int)(iold.s2 * iold.s2);
-
-            // Store ival, ixval and iyval for this window position
-            iold_val[window_offset + kx] = convert_short4(iold);
-        }
-        window_offset += window_dimension;
-    }
-
-    // Scale iA11, iA12 and iA22
-    float4 G = convert_float4(iG) * (float4)FLT_SCALE;
-
-    // Compute minimum eigen value
-    G.s3 = (float)(G.s2 + G.s0 - sqrt(pown(G.s0 - G.s2, 2) + 4.0f * G.s1 * G.s1)) * eig_const;
-
-    // Store A11, A12, A22 and min_eig
-    coeff[idx] = G;
-}
-
-/** Computes the motion vector for a given keypoint by iteratively refining its position in the new image
- *
- * @param[in]      new_image_ptr                           Pointer to the input new image. Supported data types: U8
- * @param[in]      new_image_stride_x                      Stride of the input new image in X dimension (in bytes)
- * @param[in]      new_image_step_x                        new_image_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      new_image_stride_y                      Stride of the input new image in Y dimension (in bytes)
- * @param[in]      new_image_step_y                        new_image_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      new_image_offset_first_element_in_bytes The offset of the first element in the input new image
- * @param[in, out] new_points                              An output array of key points. Those key points are defined at the new_images high resolution pyramid
- * @param[in]      coeff                                   The | A11 | A12 | A22 | min_eig | for each keypoint
- * @param[in]      iold_val                                The | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
- * @param[in]      window_dimension                        The size of the window on which to perform the algorithm
- * @param[in]      window_dimension_pow2                   The squared size of the window on which to perform the algorithm
- * @param[in]      half_window                             The half size of the window on which to perform the algorithm
- * @param[in]      num_iterations                          The maximum number of iterations
- * @param[in]      epsilon                                 Threshold on the squared magnitude of the per-iteration delta; only checked when term_epsilon is 1
- * @param[in]      border_limits                           xy: exclusive upper limits for the window origin (width - window_dimension - 1, height - window_dimension - 1); z: inclusive lower limit
- * @param[in]      eig_const                               1.0f / (float)(2.0f * window_dimension * window_dimension)
- * @param[in]      level0                                  It is set to 1 if level of pyramid = 0
- * @param[in]      term_epsilon                            It is set to 1 if termination = TERM_CRITERIA_EPSILON
- */
-void __kernel lktracker_stage1(
-    IMAGE_DECLARATION(new_image),
-    __global float4 *new_points,
-    __global float4 *coeff,
-    __global short4 *iold_val,
-    const int        window_dimension,
-    const int        window_dimension_pow2,
-    const int        half_window,
-    const int        num_iterations,
-    const float      epsilon,
-    const float3     border_limits,
-    const float      eig_const,
-    const int        level0,
-    const int        term_epsilon)
-{
-    int   idx       = get_global_id(0);
-    Image new_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(new_image);
-
-    // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
-    float4 G = coeff[idx];
-
-    // Determinant
-    float D = G.s0 * G.s2 - G.s1 * G.s1;
-
-    // Check if it is a good point to track
-    if(G.s3 < EIGENVALUE_THR || D < DETERMINANT_THR)
-    {
-        if(level0 == 1)
-        {
-            // Invalidate tracked point as we are at level 0
-            new_points[idx].s2 = 0.0f;
-        }
-
-        return;
-    }
-
-    // Compute inverse of the determinant
-    // Float literal keeps the division in single precision (no promotion to double / cl_khr_fp64 dependency)
-    D = 1.0f / D;
-
-    // Origin of the tracking window: the current estimate shifted back by half_window
-    float2 new_keypoint = new_points[idx].xy - (float)half_window;
-
-    // Last position known to be valid; written back when the search stops
-    float2 out_new_point = new_points[idx].xy;
-
-    // Keep delta obtained in the previous iteration
-    float2 prev_delta = (float2)0.0f;
-
-    int j = 0;
-    while(j < num_iterations)
-    {
-        // Get the floor value
-        float2 inew_keypoint = floor(new_keypoint);
-
-        // Stop if the window origin is below border_limits.z or not below border_limits.xy (the loops below would read outside the image)
-        if(any(inew_keypoint < border_limits.zz) || any(inew_keypoint >= border_limits.xy))
-        {
-            if(level0 == 1)
-            {
-                // Invalidate tracked point as we are at level 0
-                new_points[idx].s2 = 0.0f;
-            }
-            else
-            {
-                new_points[idx].xy = out_new_point;
-            }
-
-            return;
-        }
-
-        // Fractional part of the window origin, used as the bilinear interpolation weights
-        float2 ab = new_keypoint - inew_keypoint;
-
-        // Weight used for Bilinear-Interpolation on Old and New images
-        // w.s0 = round((1.0f - ab.x) * (1.0f - ab.y) * D0)
-        // w.s1 = round(ab.x * (1.0f - ab.y) * D0)
-        // w.s2 = round((1.0f - ab.x) * ab.y * D0)
-        // w.s3 = D0 - w.s0 - w.s1 - w.s2
-
-        float4 w;
-        w.s3  = ab.x * ab.y;
-        w.s0  = w.s3 + 1.0f - ab.x - ab.y;
-        w.s12 = ab - (float2)w.s3;
-        w     = round(w * (float4)D0);
-        w.s3  = D0 - w.s0 - w.s1 - w.s2;
-
-        // Mismatch vector
-        int2 ib = 0;
-
-        // Start of this keypoint's slice of iold_val (window_dimension_pow2 entries per keypoint)
-        int old_val_offset = idx * window_dimension_pow2;
-
-        for(int ky = 0; ky < window_dimension; ++ky)
-        {
-            for(int kx = 0; kx < window_dimension; ++kx)
-            {
-                // ival, ixval and iyval have been computed in the previous stage
-                int4 old_ival = convert_int4(iold_val[old_val_offset]);
-
-                // Load the 2x2 neighbourhood (two pixels on this row, two on the next) from new_image
-                float4 px = convert_float4((uchar4)(vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky)),
-                                                    vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky + 1))));
-
-                // Compute bilinear interpolation on new image
-                int jval = (int)round(dot(px, w) * D1);
-
-                // Compute luminance difference
-                int diff = (int)(jval - old_ival.s0);
-
-                // Accumulate values in mismatch vector
-                ib += (diff * old_ival.s12);
-
-                // Update old val offset
-                old_val_offset++;
-            }
-        }
-
-        float2 b = convert_float2(ib) * (float2)FLT_SCALE;
-
-        // Optical Flow: delta = -inverse(G) * b, with D holding 1 / det(G)
-        float2 delta;
-
-        delta.x = (float)((G.s1 * b.y - G.s2 * b.x) * D);
-        delta.y = (float)((G.s1 * b.x - G.s0 * b.y) * D);
-
-        // Update new point coordinate
-        new_keypoint += delta;
-
-        out_new_point = new_keypoint + (float2)half_window;
-
-        if(term_epsilon == 1)
-        {
-            float mag2 = dot(delta, delta);
-
-            if(mag2 <= epsilon)
-            {
-                new_points[idx].xy = out_new_point;
-
-                return;
-            }
-        }
-
-        // Oscillation check: if this delta nearly cancels the previous one, settle halfway between the two positions
-        if(j > 0 && all(fabs(delta + prev_delta) < (float2)0.01f))
-        {
-            out_new_point -= delta * (float2)0.5f;
-
-            new_points[idx].xy = out_new_point;
-
-            return;
-        }
-
-        // Update previous delta
-        prev_delta = delta;
-
-        j++;
-    }
-
-    new_points[idx].xy = out_new_point;
-}

diff --git a/src/core/CL/cl_kernels/scharr_filter.cl b/src/core/CL/cl_kernels/scharr_filter.cl
deleted file mode 100644
index d2868b6..0000000
--- a/src/core/CL/cl_kernels/scharr_filter.cl
+++ /dev/null

@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This OpenCL kernel computes Scharr3x3 gradients for 8 horizontally adjacent pixels per work-item.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] dst_gx_ptr                           Pointer to the X gradient destination image (only present with -DGRAD_X). Supported data types: S16
- * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr                           Pointer to the Y gradient destination image (only present with -DGRAD_Y). Supported data types: S16
- * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void scharr3x3(
-    IMAGE_DECLARATION(src)
-#ifdef GRAD_X
-    ,
-    IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    ,
-    IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
-    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-    // Gradient accumulators for the 8 output pixels
-#ifdef GRAD_X
-    short8 gx = (short8)0;
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    short8 gy = (short8)0;
-#endif /* GRAD_Y */
-
-    // Row0 (y - 1): 16 pixels loaded from x - 1; left/middle/right are the x-1, x, x+1 neighbours of the 8 outputs
-    uchar16 temp   = vload16(0, offset(&src, -1, -1));
-    short8  left   = convert_short8(temp.s01234567);
-    short8  middle = convert_short8(temp.s12345678);
-    short8  right  = convert_short8(temp.s23456789);
-#ifdef GRAD_X
-    gx += left * (short8)(-3);
-    gx += right * (short8)(+3);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    gy += left * (short8)(-3);
-    gy += middle * (short8)(-10);
-    gy += right * (short8)(-3);
-#endif /* GRAD_Y */
-
-    // Row1 (y): contributes to the X gradient only, through the left and right columns
-    temp  = vload16(0, offset(&src, -1, 0));
-    left  = convert_short8(temp.s01234567);
-    right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
-    gx += left * (short8)(-10);
-    gx += right * (short8)(+10);
-#endif /* GRAD_X */
-
-    // Row2 (y + 1): same column layout as Row0, with the Y gradient weights negated
-    temp   = vload16(0, offset(&src, -1, 1));
-    left   = convert_short8(temp.s01234567);
-    middle = convert_short8(temp.s12345678);
-    right  = convert_short8(temp.s23456789);
-#ifdef GRAD_X
-    gx += left * (short8)(-3);
-    gx += right * (short8)(+3);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    gy += left * (short8)(+3);
-    gy += middle * (short8)(+10);
-    gy += right * (short8)(+3);
-#endif /* GRAD_Y */
-
-    // Store the 8 results of each enabled gradient
-#ifdef GRAD_X
-    vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}

diff --git a/src/core/CL/cl_kernels/tablelookup.cl b/src/core/CL/cl_kernels/tablelookup.cl
deleted file mode 100644
index 0ef1648..0000000
--- a/src/core/CL/cl_kernels/tablelookup.cl
+++ /dev/null

@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Maps every pixel of a U8 image through a lookup table, 8 pixels per work-item.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- *
- * Each input pixel value is used directly as the index into @p lut.
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  lut                               LUT table. Supported data types: U8
- */
-__kernel void tablelookup_U8(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst),
-    __global uchar *lut)
-{
-    Image input  = CONVERT_TO_IMAGE_STRUCT(src);
-    Image output = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    /* Eight consecutive pixels, each one an index into the LUT */
-    const uchar8 px = vload8(0, input.ptr);
-
-    /* Gather the mapped values, one scalar lookup per lane */
-    const uchar8 mapped = (uchar8)((uchar4)(lut[px.s0], lut[px.s1], lut[px.s2], lut[px.s3]),
-                                   (uchar4)(lut[px.s4], lut[px.s5], lut[px.s6], lut[px.s7]));
-
-    /* Write the eight results */
-    vstore8(mapped, 0, output.ptr);
-}
-
-/** This function performs table lookup on S16 input/output images; pixels whose index falls outside the LUT keep the existing destination value.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: S16
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  lut                               LUT table. Supported data types: S16
- * @param[in]  offset                            Value added to each input pixel to obtain its LUT index
- * @param[in]  count                             Number of elements in the LUT
- */
-__kernel void tablelookup_S16(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst),
-    __global short *lut,
-    uint            offset,
-    uint            count)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    /* Load input data */
-    short8 data = vload8(0, (__global short *)src.ptr);
-
-    /* Load the current destination values; they are written back unchanged for out-of-range indices */
-    int8 out_data = convert_int8(vload8(0, (__global short *)dst.ptr));
-
-    /* Calculate index (pixel + offset); lanes outside [0, count) are redirected to index 0 for the gather below */
-    int8 index = convert_int8(data) + (int8)(offset);
-    int8 cond  = (index >= 0 && index < (int8)count);
-    index      = select(0, index, cond);
-
-    /* Load lut data */
-    int8 lut_data = (int8)(lut[index.s0], lut[index.s1], lut[index.s2], lut[index.s3],
-                           lut[index.s4], lut[index.s5], lut[index.s6], lut[index.s7]);
-
-    /* Keep the LUT value for in-range lanes and the existing destination value otherwise */
-    lut_data = select(out_data, lut_data, cond);
-
-    /* Store result */
-    vstore8(convert_short8(lut_data), 0, (__global short *)dst.ptr);
-}

diff --git a/src/core/CL/cl_kernels/threshold.cl b/src/core/CL/cl_kernels/threshold.cl
deleted file mode 100644
index ff3ac05..0000000
--- a/src/core/CL/cl_kernels/threshold.cl
+++ /dev/null

@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Binary threshold of an image, 16 pixels per work-item: pixels strictly greater than the threshold become true_val, all others false_val.
- *
- * @param[in]  in_ptr                            Pointer to the source image
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  false_val                         Value written where the pixel is not above the threshold
- * @param[in]  true_val                          Value written where the pixel is above the threshold
- * @param[in]  threshold                         The threshold value
- */
-__kernel void threshold_binary(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const uchar false_val,
-    const uchar true_val,
-    const uchar threshold)
-{
-    // Resolve the per-workitem image pointers
-    Image src_img = CONVERT_TO_IMAGE_STRUCT(in);
-    Image dst_img = CONVERT_TO_IMAGE_STRUCT(out);
-
-    // Fetch 16 pixels
-    const uchar16 pixels = vload16(0, src_img.ptr);
-
-    // Lanes strictly above the threshold take true_val, the rest false_val
-    const uchar16 result = select((uchar16)false_val, (uchar16)true_val, pixels > (uchar16)threshold);
-
-    // Write the 16 results
-    vstore16(result, 0, dst_img.ptr);
-}
-
-/** Range threshold of an image, 16 pixels per work-item: pixels inside [lower, upper] become true_val, all others false_val.
- *
- * @param[in]  in_ptr                            Pointer to the source image
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  false_val                         Value written where the pixel is outside the range
- * @param[in]  true_val                          Value written where the pixel is inside the range
- * @param[in]  lower                             Lower threshold (inclusive)
- * @param[in]  upper                             Upper threshold (inclusive)
- */
-__kernel void threshold_range(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const uchar false_val,
-    const uchar true_val,
-    const uchar lower,
-    const uchar upper)
-{
-    // Resolve the per-workitem image pointers
-    Image src_img = CONVERT_TO_IMAGE_STRUCT(in);
-    Image dst_img = CONVERT_TO_IMAGE_STRUCT(out);
-
-    // Fetch 16 pixels
-    const uchar16 pixels = vload16(0, src_img.ptr);
-
-    // Lanes with lower <= pixel <= upper take true_val, the rest false_val
-    const uchar16 result = select((uchar16)false_val, (uchar16)true_val, pixels <= (uchar16)upper && pixels >= (uchar16)lower);
-
-    // Write the 16 results
-    vstore16(result, 0, dst_img.ptr);
-}

diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl
deleted file mode 100644
index 909b920..0000000
--- a/src/core/CL/cl_kernels/warp_affine.cl
+++ /dev/null

@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-/** Returns a vector of floats contaning the matrix coefficients. */
-inline const float8 build_affine_mtx()
-{
-    return (float8)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, 0, 0);
-}
-
-/** Transforms 4 2D coordinates using the formula:
- *
- *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- *
- * @param[in] coord 2D coordinate to transform.
- * @param[in] mtx   affine matrix
- *
- * @return a int8 containing 4 2D transformed values.
- */
-inline const float8 apply_affine_transform(const float2 coord, const float8 mtx)
-{
-    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-    // transform [x,x+1,x+2,x+3]
-    const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4)));
-    // transform [y,y+1,y+2,y+3]
-    const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5)));
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-}
-
-/** Performs an affine transform on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8.
- *
- * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation:
- *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- *   output(x,y) = input(x0,y0)
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in]  width                             Width of the destination image
- * @param[in]  height                            Height of the destination image
- */
-__kernel void warp_affine_nearest_neighbour(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const int width,
-    const int height)
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
-}
-
-/** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in]  width                             Width of the destination image
- * @param[in]  height                            Height of the destination image
- */
-__kernel void warp_affine_bilinear(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const int width,
-    const int height)
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(bilinear_interpolate(&in, apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), 0, out.ptr);
-}

diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl
deleted file mode 100644
index bed7838..0000000
--- a/src/core/CL/cl_kernels/warp_perspective.cl
+++ /dev/null

@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-/** Returns the perspective matrix */
-inline const float16 build_perspective_mtx()
-{
-    return (float16)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, 0, 0, 0, (float4)0);
-}
-
-/** Transforms four 2D coordinates using the formula:
- *
- *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- *   z0 = M[3][1] * x + M[3][2] * y + M[3][3]
- *
- *   (x0/z0,y0/z0)
- *
- * @param[in] coord 2D coordinate to transform.
- * @param[in] mtx   perspective matrix
- *
- * @return a vector float8 containing four 2D transformed values.
- */
-inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx)
-{
-    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-    // transform [z,z+1,z+2,z+3]
-    const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8)));
-    // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation
-    // transform [x,x+1,x+2,x+3]
-    const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z;
-    // transform [y,y+1,y+2,y+3]
-    const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z;
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-}
-
-/** Performs perspective transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8.
- *
- * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation:
- *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- *   z0 = M[3][1] * x + M[3][2] * y + M[3][3]
- *
- *   output(x,y) = input(x0/z0,y0/z0)
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in]  width                             Width of the destination image
- * @param[in]  height                            Height of the destination image
- */
-__kernel void warp_perspective_nearest_neighbour(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const int width,
-    const int height)
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr);
-}
-
-/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in]  width                             Width of the destination image
- * @param[in]  height                            Height of the destination image
- */
-__kernel void warp_perspective_bilinear(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const int width,
-    const int height)
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(bilinear_interpolate(&in, apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), 0, out.ptr);
-}

diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
deleted file mode 100644
index 76b60cb..0000000
--- a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
+++ /dev/null

@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-
-#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLAbsoluteDifferenceKernel::CLAbsoluteDifferenceKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLAbsoluteDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
-}
-
-void CLAbsoluteDifferenceKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
-                             "The output image can only be U8 if both input images are U8");
-
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
-    // Set kernel build options
-    std::set<std::string> build_opts;
-    build_opts.insert("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-    build_opts.insert("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-    build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "absdiff", build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input1_access, input2_access, output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
-                                                       input2->info()->valid_region());
-
-    output_access.set_valid_region(win, valid_region);
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLAbsoluteDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input1, slice);
-        add_2D_tensor_argument(idx, _input2, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h
deleted file mode 100644
index 28f28fe..0000000
--- a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h
+++ /dev/null

@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H
-#define ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the absolute difference kernel.
- *
- * Absolute difference is computed by:
- * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
- */
-class CLAbsoluteDifferenceKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLAbsoluteDifferenceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLAbsoluteDifferenceKernel(const CLAbsoluteDifferenceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLAbsoluteDifferenceKernel &operator=(const CLAbsoluteDifferenceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLAbsoluteDifferenceKernel(CLAbsoluteDifferenceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLAbsoluteDifferenceKernel &operator=(CLAbsoluteDifferenceKernel &&) = default;
-    /** Default destructor */
-    ~CLAbsoluteDifferenceKernel() = default;
-
-    /** Set the inputs and output images.
-     *
-     * @param[in]  input1 Source tensor. Data types supported: U8/S16.
-     * @param[in]  input2 Source tensor. Data types supported: U8/S16.
-     * @param[out] output Destination tensor. Data types supported: U8/S16.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-    /** Set the inputs and output images.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          Source tensor. Data types supported: U8/S16.
-     * @param[in]  input2          Source tensor. Data types supported: U8/S16.
-     * @param[out] output          Destination tensor. Data types supported: U8/S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input1; /**< Source tensor 1. */
-    const ICLTensor *_input2; /**< Source tensor 2. */
-    ICLTensor       *_output; /**< Destination tensor. */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H */

diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp
deleted file mode 100644
index b0a8eba..0000000
--- a/src/core/CL/kernels/CLAccumulateKernel.cpp
+++ /dev/null

@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLAccumulateKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-} // namespace
-
-void CLAccumulateKernel::configure(const ICLTensor *input, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, accum);
-}
-
-void CLAccumulateKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "accumulate");
-
-    // Make sure _kernel is initialized before calling the parent's configure
-    ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-
-void CLAccumulateWeightedKernel::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum);
-}
-
-void CLAccumulateWeightedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "accumulate_weighted");
-
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, alpha);
-
-    // Configure kernel window
-    ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-
-void CLAccumulateSquaredKernel::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum);
-}
-
-void CLAccumulateSquaredKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(shift > 15);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "accumulate_squared");
-
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, shift);
-
-    // Configure kernel window
-    ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-} // namespace arm_compute

diff --git a/src/core/CL/kernels/CLAccumulateKernel.h b/src/core/CL/kernels/CLAccumulateKernel.h
deleted file mode 100644
index 16a7153..0000000
--- a/src/core/CL/kernels/CLAccumulateKernel.h
+++ /dev/null

@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLACCUMULATEKERNEL_H
-#define ARM_COMPUTE_CLACCUMULATEKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the accumulate kernel.
- *
- * Accumulation is computed by:
- * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
- */
-class CLAccumulateKernel : public ICLSimple2DKernel
-{
-public:
-    /** Set the input and accumulation tensors.
-     *
-     * @param[in]  input Source tensor. Data types supported: U8.
-     * @param[out] accum Destination tensor. Data types supported: S16.
-     */
-    void configure(const ICLTensor *input, ICLTensor *accum);
-    /** Set the input and accumulation tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] accum           Destination tensor. Data types supported: S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum);
-};
-
-/** Interface for the accumulate weighted kernel.
- *
- * Weighted accumulation is computed:
- * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
- *
- * Where @f$ 0 \le \alpha \le 1 @f$
- * Conceptually, the rounding for this is defined as:
- * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
-*/
-class CLAccumulateWeightedKernel : public ICLSimple2DKernel
-{
-public:
-    /** Set the input and accumulation images, and the scale value.
-     *
-     * @param[in]     input Source tensor. Data types supported: U8.
-     * @param[in]     alpha Scalar value in the range [0, 1.0]. Data types supported: F32.
-     * @param[in,out] accum Accumulated tensor. Data types supported: U8.
-     */
-    void configure(const ICLTensor *input, float alpha, ICLTensor *accum);
-    /** Set the input and accumulation images, and the scale value.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Source tensor. Data types supported: U8.
-     * @param[in]     alpha           Scalar value in the range [0, 1.0]. Data types supported: F32.
-     * @param[in,out] accum           Accumulated tensor. Data types supported: U8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum);
-};
-
-/** Interface for the accumulate squared kernel.
- *
- * The accumulation of squares is computed:
- * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
- *
- * Where @f$ 0 \le shift \le 15 @f$
-*/
-class CLAccumulateSquaredKernel : public ICLSimple2DKernel
-{
-public:
-    /** Set the input and accumulation tensors and the shift value.
-     *
-     * @param[in]     input Source tensor. Data types supported: U8.
-     * @param[in]     shift Shift value in the range of [0, 15]. Data types supported: U32.
-     * @param[in,out] accum Accumulated tensor. Data types supported: S16.
-     */
-    void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum);
-    /** Set the input and accumulation tensors and the shift value.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Source tensor. Data types supported: U8.
-     * @param[in]     shift           Shift value in the range of [0, 15]. Data types supported: U32.
-     * @param[in,out] accum           Accumulated tensor. Data types supported: S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLACCUMULATEKERNEL_H */

diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
deleted file mode 100644
index 9f493b4..0000000
--- a/src/core/CL/kernels/CLBox3x3Kernel.cpp
+++ /dev/null

@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLBox3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-BorderSize CLBox3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLBox3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    _input  = input;
-    _output = output;
-
-    // Set build options
-    std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=1", "-DMAT2=1",
-                                         "-DMAT3=1", "-DMAT4=1", "-DMAT5=1",
-                                         "-DMAT6=1", "-DMAT7=1", "-DMAT8=1",
-                                         "-DSCALE=9", "-DDATA_TYPE_OUT=uchar"
-                                       };
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "convolution3x3_static", build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}

diff --git a/src/core/CL/kernels/CLBox3x3Kernel.h b/src/core/CL/kernels/CLBox3x3Kernel.h
deleted file mode 100644
index 2373c4a..0000000
--- a/src/core/CL/kernels/CLBox3x3Kernel.h
+++ /dev/null

@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBOX3X3KERNEL_H
-#define ARM_COMPUTE_CLBOX3X3KERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the box 3x3 filter kernel.
- *
- */
-class CLBox3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    //Inherited methods overriden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLBOX3X3KERNEL_H */

diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
deleted file mode 100644
index 1fe944c..0000000
--- a/src/core/CL/kernels/CLCannyEdgeKernel.cpp
+++ /dev/null

@@ -1,310 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLCannyEdgeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-CLGradientKernel::CLGradientKernel()
-    : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
-{
-}
-
-void CLGradientKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), gx, gy, magnitude, phase, norm_type);
-}
-
-void CLGradientKernel::configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(gy->info()->data_type()),
-                             "Gx and Gy must have the same pixel size");
-    ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(magnitude->info()->data_type()),
-                             "Mag must have the same pixel size as Gx and Gy");
-
-    _gx        = gx;
-    _gy        = gy;
-    _magnitude = magnitude;
-    _phase     = phase;
-
-    // Create build opts
-    std::set<std::string> built_opts;
-    built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(gx->info()->data_type()));
-    built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(gx->info()->data_type()));
-
-    // Create kernel
-    const std::string kernel_name = (norm_type == 1) ? std::string("combine_gradients_L1") : std::string("combine_gradients_L2");
-    _kernel                       = create_kernel(compile_context, kernel_name, built_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-
-    Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
-
-    mag_access.set_valid_region(win, _gx->info()->valid_region());
-    phase_access.set_valid_region(win, _gx->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(gx->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gx->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gx->info()->dimension(1));
-}
-
-void CLGradientKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _gx, slice);
-        add_2D_tensor_argument(idx, _gy, slice);
-        add_2D_tensor_argument(idx, _magnitude, slice);
-        add_2D_tensor_argument(idx, _phase, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLEdgeNonMaxSuppressionKernel::CLEdgeNonMaxSuppressionKernel()
-    : _magnitude(nullptr), _phase(nullptr), _output(nullptr)
-{
-}
-
-BorderSize CLEdgeNonMaxSuppressionKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLEdgeNonMaxSuppressionKernel::configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), magnitude, phase, output, lower_thr, border_undefined);
-}
-
-void CLEdgeNonMaxSuppressionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::U32);
-
-    _magnitude = magnitude;
-    _phase     = phase;
-    _output    = output;
-
-    // Create build opts
-    std::set<std::string> built_opts;
-    built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(magnitude->info()->data_type()));
-    built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-    // Create kernel
-    const std::string kernel_name = std::string("suppress_non_maximum");
-    _kernel                       = create_kernel(compile_context, kernel_name, built_opts);
-
-    // Set minimum threshold argument
-    unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, lower_thr);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration    = 1;
-    constexpr unsigned int num_elems_read_written_per_iteration = 3;
-
-    Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top,
-                                     num_elems_read_written_per_iteration, num_elems_read_written_per_iteration);
-    AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, mag_access, phase_access, output_access);
-
-    output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(output->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLEdgeNonMaxSuppressionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _magnitude, slice);
-        add_2D_tensor_argument(idx, _phase, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLEdgeTraceKernel::CLEdgeTraceKernel()
-    : _input(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0), _visited(nullptr), _recorded(nullptr), _l1_stack(nullptr), _l1_stack_counter(nullptr)
-{
-}
-
-void CLEdgeTraceKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
-                                  ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, visited, recorded, l1_stack, l1_stack_counter);
-}
-
-void CLEdgeTraceKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
-                                  ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(visited, 1, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(recorded, 1, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack, 1, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack_counter, 1, DataType::U8);
-
-    _input            = input;
-    _output           = output;
-    _lower_thr        = lower_thr;
-    _upper_thr        = upper_thr;
-    _visited          = visited;
-    _recorded         = recorded;
-    _l1_stack         = l1_stack;
-    _l1_stack_counter = l1_stack_counter;
-
-    // Create build opts
-    std::set<std::string> built_opts;
-    built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
-    built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-    // Create kernel
-    const std::string kernel_name = std::string("hysteresis");
-    _kernel                       = create_kernel(compile_context, kernel_name, built_opts);
-
-    // Set constant kernel args
-    unsigned int width  = _input->info()->dimension(0);
-    unsigned int height = _input->info()->dimension(1);
-    unsigned int idx    = 6 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, static_cast<cl_uint>(_lower_thr));
-    _kernel.setArg(idx++, static_cast<cl_uint>(_upper_thr));
-    _kernel.setArg(idx++, static_cast<cl_uint>(width));
-    _kernel.setArg(idx++, static_cast<cl_uint>(height));
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    Window                 win                               = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal visited_access(_visited->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal recorded_access(_recorded->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal l1_stack_access(_l1_stack->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal l1_stack_counter_access(_l1_stack_counter->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(_input->info(), 0, num_elems_processed_per_iteration),
-                              output_access,
-                              visited_access,
-                              recorded_access,
-                              l1_stack_access,
-                              l1_stack_counter_access);
-
-    output_access.set_valid_region(win, _input->info()->valid_region());
-    visited_access.set_valid_region(win, _input->info()->valid_region());
-    recorded_access.set_valid_region(win, _input->info()->valid_region());
-    l1_stack_access.set_valid_region(win, _input->info()->valid_region());
-    l1_stack_counter_access.set_valid_region(win, _input->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += lower_string(string_from_format(output->info()->format()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLEdgeTraceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        add_2D_tensor_argument(idx, _visited, slice);
-        add_2D_tensor_argument(idx, _recorded, slice);
-        add_2D_tensor_argument(idx, _l1_stack, slice);
-        add_2D_tensor_argument(idx, _l1_stack_counter, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.h b/src/core/CL/kernels/CLCannyEdgeKernel.h
deleted file mode 100644
index 7543822..0000000
--- a/src/core/CL/kernels/CLCannyEdgeKernel.h
+++ /dev/null

@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCANNYEDGEKERNEL_H
-#define ARM_COMPUTE_CLCANNYEDGEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform Gradient computation.
- */
-class CLGradientKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGradientKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGradientKernel(const CLGradientKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGradientKernel &operator=(const CLGradientKernel &) = delete;
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @note gx, gy and mag must all be the same size (either 16 or 32).
-     *
-     * @param[in]  gx        Source tensor - Gx component. Data types supported: S16/S32.
-     * @param[in]  gy        Source tensor - Gy component. Data types supported: Same as gx.
-     * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy.
-     * @param[out] phase     Destination tensor - Quantized phase. Data types supported: U8.
-     * @param[in]  norm_type Normalization type. if 1, L1-Norm otherwise L2-Norm.
-     */
-    void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type);
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @note gx, gy and mag must all be the same size (either 16 or 32).
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  gx              Source tensor - Gx component. Data types supported: S16/S32.
-     * @param[in]  gy              Source tensor - Gy component. Data types supported: Same as gx.
-     * @param[out] magnitude       Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy.
-     * @param[out] phase           Destination tensor - Quantized phase. Data types supported: U8.
-     * @param[in]  norm_type       Normalization type. if 1, L1-Norm otherwise L2-Norm.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_gx;        /**< Source tensor - Gx component */
-    const ICLTensor *_gy;        /**< Source tensor - Gy component */
-    ICLTensor       *_magnitude; /**< Destination tensor - Magnitude */
-    ICLTensor       *_phase;     /**< Destination tensor - Quantized phase */
-};
-
-/** OpenCL kernel to perform Non-Maxima suppression for Canny Edge.
- *
- * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
- *       to characterize points as possible edges. The output buffer needs to be cleared before this kernel is executed.
- *
- * @note Hysteresis is computed in @ref CLEdgeTraceKernel
- */
-class CLEdgeNonMaxSuppressionKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLEdgeNonMaxSuppressionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeNonMaxSuppressionKernel(const CLEdgeNonMaxSuppressionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeNonMaxSuppressionKernel &operator=(const CLEdgeNonMaxSuppressionKernel &) = delete;
-    /** Initialise the kernel's sources, destination and border mode.
-     *
-     * @param[in]  magnitude        Source tensor - Magnitude. Data types supported: U16/U32.
-     * @param[in]  phase            Source tensor - Quantized phase. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U16/U32.
-     * @param[in]  lower_thr        Lower threshold.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined);
-    /** Initialise the kernel's sources, destination and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  magnitude        Source tensor - Magnitude. Data types supported: U16/U32.
-     * @param[in]  phase            Source tensor - Quantized phase. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U16/U32.
-     * @param[in]  lower_thr        Lower threshold.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_magnitude; /**< Source tensor - Magnitude. */
-    const ICLTensor *_phase;     /**< Source tensor - Quantized phase. */
-    ICLTensor       *_output;    /**< Destination tensor. */
-};
-
-/** OpenCL kernel to perform Edge tracing.
- */
-class CLEdgeTraceKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLEdgeTraceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeTraceKernel(const CLEdgeTraceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeTraceKernel &operator=(const CLEdgeTraceKernel &) = delete;
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]     input            Source tensor. Data types supported: U16/U32.
-     * @param[out]    output           Destination tensor. Data types supported: U8.
-     * @param[in]     upper_thr        Upper threshold used for the hysteresis
-     * @param[in]     lower_thr        Lower threshold used for the hysteresis
-     * @param[in,out] visited          Tensor for keeping the visited pixels. Data types supported: U32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] recorded         Tensor for keeping the recorded pixels. Data types supported: U32
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack         Tensor with the L1 stack for each pixel. Data types supported: S32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8.
-     *                                              Expected to be initialized to 0 before each run.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
-                   ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter);
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]     compile_context  The compile context to be used.
-     * @param[in]     input            Source tensor. Data types supported: U16/U32.
-     * @param[out]    output           Destination tensor. Data types supported: U8.
-     * @param[in]     upper_thr        Upper threshold used for the hysteresis
-     * @param[in]     lower_thr        Lower threshold used for the hysteresis
-     * @param[in,out] visited          Tensor for keeping the visited pixels. Data types supported: U32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] recorded         Tensor for keeping the recorded pixels. Data types supported: U32
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack         Tensor with the L1 stack for each pixel. Data types supported: S32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8.
-     *                                              Expected to be initialized to 0 before each run.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
-                   ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;            /**< Source tensor. */
-    ICLTensor       *_output;           /**< Destination tensor. */
-    int32_t          _lower_thr;        /**< Lower threshold used for the hysteresis. */
-    int32_t          _upper_thr;        /**< Upper threshold used for the hysteresis. */
-    ICLTensor       *_visited;          /**< Marks visited elements */
-    ICLTensor       *_recorded;         /**< Marks recorded elements */
-    ICLTensor       *_l1_stack;         /**< L1 hysteris stack */
-    ICLTensor       *_l1_stack_counter; /**< L1 hysteris stack counter */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCANNYEDGEKERNEL_H */

diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
deleted file mode 100644
index 52ba9dd..0000000
--- a/src/core/CL/kernels/CLChannelCombineKernel.cpp
+++ /dev/null

@@ -1,296 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLChannelCombineKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLMultiImage.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-} // namespace
-
-CLChannelCombineKernel::CLChannelCombineKernel()
-    : _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } }
-{
-}
-
-void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output);
-}
-
-void CLChannelCombineKernel::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
-
-    const Format output_format = output->info()->format();
-
-    // Check if horizontal dimension of Y plane is even and validate horizontal sub-sampling dimensions for U and V planes
-    if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
-    {
-        // Validate Y plane of input and output
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output);
-
-        // Validate U and V plane of the input
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
-    }
-
-    _planes[0] = plane0;
-    _planes[1] = plane1;
-    _planes[2] = plane2;
-    _planes[3] = nullptr;
-
-    // Validate the last input tensor only for RGBA format
-    if(Format::RGBA8888 == output_format)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(plane3);
-        ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane3);
-
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8);
-
-        _planes[3] = plane3;
-    }
-
-    _output       = output;
-    _output_multi = nullptr;
-
-    // Half the processed elements for U and V channels due to horizontal sub-sampling of 2
-    if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
-    {
-        _x_subsampling[1] = 2;
-        _x_subsampling[2] = 2;
-    }
-
-    // Create kernel
-    std::string kernel_name = "channel_combine_" + string_from_format(output_format);
-    _kernel                 = create_kernel(compile_context, kernel_name);
-
-    // Configure window
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal plane0_access(plane0->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
-    AccessWindowRectangle  plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
-    AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, plane0_access, plane1_access, plane2_access, plane3_access, output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(),
-                                                       plane1->info()->valid_region(),
-                                                       plane2->info()->valid_region());
-    if(plane3 != nullptr)
-    {
-        valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region);
-    }
-    output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output);
-}
-
-void CLChannelCombineKernel::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
-
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
-
-    const Format output_format = output->info()->format();
-
-    // Validate shape of Y plane to be even and shape of sub-sampling dimensions for U and V planes
-    // Perform validation only for formats which require sub-sampling.
-    if(Format::YUV444 != output_format)
-    {
-        // Validate Y plane of input and output
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output->plane(0));
-
-        // Validate U and V plane of the input
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
-
-        // Validate second plane U (NV12 and NV21 have a UV88 combined plane while IYUV has only the U plane)
-        // MultiImage generates the correct tensor shape but also check in case the tensor shape of planes was changed to a wrong size
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(1));
-
-        // Validate the last plane V of format IYUV
-        if(Format::IYUV == output_format)
-        {
-            // Validate V plane of the output
-            ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(2));
-        }
-    }
-
-    // Set input tensors
-    _planes[0] = plane0;
-    _planes[1] = plane1;
-    _planes[2] = plane2;
-    _planes[3] = nullptr;
-
-    // Set output tensor
-    _output       = nullptr;
-    _output_multi = output;
-
-    bool has_two_planars = false;
-
-    // Set sub-sampling parameters for each plane
-    std::string           kernel_name;
-    std::set<std::string> build_opts;
-
-    if(Format::NV12 == output_format || Format::NV21 == output_format)
-    {
-        _x_subsampling = { { 1, 2, 2 } };
-        _y_subsampling = { { 1, 2, 2 } };
-        kernel_name    = "channel_combine_NV";
-        build_opts.emplace(Format::NV12 == output_format ? "-DNV12" : "-DNV21");
-        has_two_planars = true;
-    }
-    else
-    {
-        if(Format::IYUV == output_format)
-        {
-            _x_subsampling = { { 1, 2, 2 } };
-            _y_subsampling = { { 1, 2, 2 } };
-        }
-
-        kernel_name = "copy_planes_3p";
-        build_opts.emplace(Format::IYUV == output_format ? "-DIYUV" : "-DYUV444");
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure window
-    Window win = calculate_max_window(*plane0->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowRectangle input_plane0_access(plane0->info(), 0, 0, num_elems_processed_per_iteration, 1.f);
-    AccessWindowRectangle input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
-    AccessWindowRectangle input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
-    AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f, 1.f / _y_subsampling[1]);
-    AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
-    AccessWindowRectangle output_plane2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
-
-    update_window_and_padding(win,
-                              input_plane0_access, input_plane1_access, input_plane2_access,
-                              output_plane0_access, output_plane1_access, output_plane2_access);
-
-    ValidRegion plane0_valid_region  = plane0->info()->valid_region();
-    ValidRegion output_plane1_region = has_two_planars ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
-    output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
-    output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape()));
-    output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLChannelCombineKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    slice.set_dimension_step(Window::DimY, 1);
-
-    do
-    {
-        // Subsampling in plane 1
-        Window win_sub_plane1(slice);
-        win_sub_plane1.set(Window::DimX, Window::Dimension(win_sub_plane1.x().start() / _x_subsampling[1], win_sub_plane1.x().end() / _x_subsampling[1], win_sub_plane1.x().step() / _x_subsampling[1]));
-        win_sub_plane1.set(Window::DimY, Window::Dimension(win_sub_plane1.y().start() / _y_subsampling[1], win_sub_plane1.y().end() / _y_subsampling[1], 1));
-
-        // Subsampling in plane 2
-        Window win_sub_plane2(slice);
-        win_sub_plane2.set(Window::DimX, Window::Dimension(win_sub_plane2.x().start() / _x_subsampling[2], win_sub_plane2.x().end() / _x_subsampling[2], win_sub_plane2.x().step() / _x_subsampling[2]));
-        win_sub_plane2.set(Window::DimY, Window::Dimension(win_sub_plane2.y().start() / _y_subsampling[2], win_sub_plane2.y().end() / _y_subsampling[2], 1));
-
-        unsigned int idx = 0;
-
-        // Set inputs
-        add_2D_tensor_argument(idx, _planes[0], slice);
-        add_2D_tensor_argument(idx, _planes[1], win_sub_plane1);
-        add_2D_tensor_argument(idx, _planes[2], win_sub_plane2);
-        add_2D_tensor_argument_if((nullptr != _planes[3]), idx, _planes[3], slice);
-
-        // Set outputs
-        if(nullptr != _output) // Single planar output
-        {
-            add_2D_tensor_argument(idx, _output, slice);
-        }
-        else // Multi-planar output
-        {
-            // Reduce slice in case of subsampling to avoid out-of-bounds access
-            slice.set(Window::DimY, Window::Dimension(slice.y().start() / _y_subsampling[1], slice.y().end() / _y_subsampling[1], 1));
-
-            add_2D_tensor_argument(idx, _output_multi->cl_plane(0), slice);
-            add_2D_tensor_argument(idx, _output_multi->cl_plane(1), win_sub_plane1);
-            add_2D_tensor_argument_if((3 == num_planes_from_format(_output_multi->info()->format())), idx, _output_multi->cl_plane(2), win_sub_plane2);
-
-            _kernel.setArg(idx++, slice.y().end());
-        }
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-} // namespace arm_compute

diff --git a/src/core/CL/kernels/CLChannelCombineKernel.h b/src/core/CL/kernels/CLChannelCombineKernel.h
deleted file mode 100644
index f19995a..0000000
--- a/src/core/CL/kernels/CLChannelCombineKernel.h
+++ /dev/null

@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H
-#define ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-#include <array>
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the channel combine kernel */
-class CLChannelCombineKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLChannelCombineKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelCombineKernel(const CLChannelCombineKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelCombineKernel &operator=(const CLChannelCombineKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLChannelCombineKernel(CLChannelCombineKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLChannelCombineKernel &operator=(CLChannelCombineKernel &&) = default;
-    /** Default destructor */
-    ~CLChannelCombineKernel() = default;
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[in]  plane3 The 2D plane that forms channel 3. Must be of U8 format.
-     * @param[out] output The single planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422.
-     */
-    void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  plane0          The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1          The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2          The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[in]  plane3          The 2D plane that forms channel 3. Must be of U8 format.
-     * @param[out] output          The single planar output tensor.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[out] output The multi planar output tensor. Supported formats: NV12/NV21/IYUV/YUV444.
-     */
-    void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  plane0          The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1          The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2          The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[out] output          The multi planar output tensor. Supported formats: NV12/NV21/IYUV/YUV444.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    std::array<const ICLTensor *, 4> _planes;
-    ICLTensor     *_output;
-    ICLMultiImage *_output_multi;
-    std::array<uint32_t, 3> _x_subsampling;
-    std::array<uint32_t, 3> _y_subsampling;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H */

diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
deleted file mode 100644
index cbf504b..0000000
--- a/src/core/CL/kernels/CLChannelExtractKernel.cpp
+++ /dev/null

@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLChannelExtractKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLMultiImage.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLChannelExtractKernel::CLChannelExtractKernel()
-    : _input(nullptr), _output(nullptr), _num_elems_processed_per_iteration(8), _subsampling(1)
-{
-}
-
-void CLChannelExtractKernel::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
-}
-
-void CLChannelExtractKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON(input == output);
-
-    set_format_if_unknown(*output->info(), Format::U8);
-
-    // Check if input tensor has a valid format
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
-    // Check if channel is valid for given format
-    const Format format = input->info()->format();
-    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
-
-    // Half the processed elements for U,V channels due to sub-sampling of 2
-    _subsampling = 1;
-
-    if(format == Format::YUYV422 || format == Format::UYVY422)
-    {
-        // Check if the width of the tensor shape is even for formats with subsampled channels (UYVY422 and YUYV422)
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input);
-
-        if(channel != Channel::Y)
-        {
-            _subsampling = 2;
-        }
-    }
-
-    // Calculate output tensor shape using subsampling
-    TensorShape output_shape = calculate_subsampled_shape(input->info()->tensor_shape(), format, channel);
-    set_shape_if_empty(*output->info(), output_shape);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    std::string           kernel_name = "channel_extract_" + string_from_format(format);
-    std::set<std::string> build_opts  = { ("-DCHANNEL_" + string_from_channel(channel)) };
-    _kernel                           = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure window
-    Window                 win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
-    AccessWindowRectangle  output_access(output->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    ValidRegion input_valid_region = input->info()->valid_region();
-    output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLChannelExtractKernel::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
-}
-
-void CLChannelExtractKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
-    set_format_if_unknown(*output->info(), Format::U8);
-
-    // Check if channel is valid for given format
-    const Format format = input->info()->format();
-    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
-
-    // Get input plane from the given channel
-    const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(format, channel));
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input_plane);
-
-    if(Channel::Y == channel && format != Format::YUV444)
-    {
-        // Check if the tensor shape of the Y plane is even for formats with subsampled channels (NV12, NV21 and IYUV)
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input_plane);
-    }
-
-    // Calculate 2x2 subsampled tensor shape
-    TensorShape output_shape = calculate_subsampled_shape(input->cl_plane(0)->info()->tensor_shape(), format, channel);
-    set_shape_if_empty(*output->info(), output_shape);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output_shape, output->info()->tensor_shape());
-
-    // Check if input tensor has a valid format
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
-
-    _output      = output;
-    _input       = input_plane;
-    _subsampling = 1;
-
-    // Create kernel
-    std::string           kernel_name;
-    std::set<std::string> build_opts;
-    if(Channel::Y == channel || Format::IYUV == format || Format::YUV444 == format)
-    {
-        kernel_name = "copy_plane";
-    }
-    else
-    {
-        kernel_name = "channel_extract_" + string_from_format(format);
-        build_opts.insert(("-DCHANNEL_" + string_from_channel(channel)));
-    }
-    _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure window
-    Window                 win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input_plane->info(), 0, _num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input_plane->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLChannelExtractKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        Window win_sub(slice);
-        win_sub.set(Window::DimX, Window::Dimension(win_sub.x().start() / _subsampling, win_sub.x().end() / _subsampling, win_sub.x().step() / _subsampling));
-        win_sub.set(Window::DimY, Window::Dimension(win_sub.y().start() / _subsampling, win_sub.y().end() / _subsampling, 1));
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument(idx, _output, win_sub);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLChannelExtractKernel.h b/src/core/CL/kernels/CLChannelExtractKernel.h
deleted file mode 100644
index 37abde5..0000000
--- a/src/core/CL/kernels/CLChannelExtractKernel.h
+++ /dev/null

@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H
-#define ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the channel extract kernel */
-class CLChannelExtractKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLChannelExtractKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelExtractKernel(const CLChannelExtractKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelExtractKernel &operator=(const CLChannelExtractKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLChannelExtractKernel(CLChannelExtractKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLChannelExtractKernel &operator=(CLChannelExtractKernel &&) = default;
-    /** Default destructor */
-    ~CLChannelExtractKernel() = default;
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input   Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
-     * @param[in]  channel Channel to extract.
-     * @param[out] output  Destination tensor. Must be of U8 format.
-     */
-    void configure(const ICLTensor *input, Channel channel, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
-     * @param[in]  channel         Channel to extract.
-     * @param[out] output          Destination tensor. Must be of U8 format.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input   Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
-     * @param[in]  channel Channel to extract.
-     * @param[out] output  Single-planar 2D destination image. Must be of U8 format.
-     */
-    void configure(const ICLMultiImage *input, Channel channel, ICLImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
-     * @param[in]  channel         Channel to extract.
-     * @param[out] output          Single-planar 2D destination image. Must be of U8 format.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    uint32_t         _num_elems_processed_per_iteration;
-    uint32_t         _subsampling;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H */

diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
deleted file mode 100644
index 6c61fec..0000000
--- a/src/core/CL/kernels/CLColorConvertKernel.cpp
+++ /dev/null

@@ -1,558 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLColorConvertKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLMultiImage.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <sstream>
-
-using namespace arm_compute;
-
-CLColorConvertKernel::CLColorConvertKernel()
-    : _input(nullptr), _output(nullptr), _multi_input(nullptr), _multi_output(nullptr)
-{
-}
-
-void CLColorConvertKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON(input == nullptr);
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
-
-    unsigned int num_elems_processed_per_iteration = 0;
-    switch(input->info()->format())
-    {
-        case Format::RGBA8888:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::UYVY422:
-        case Format::YUYV422:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                case Format::RGBA8888:
-                    num_elems_processed_per_iteration = 8;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::RGB888:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGBA8888:
-                case Format::U8:
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            break;
-    }
-    ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
-                                 string_from_format(input->info()->format()).c_str(),
-                                 string_from_format(output->info()->format()).c_str());
-
-    std::stringstream kernel_name;
-
-    kernel_name << string_from_format(input->info()->format());
-    kernel_name << "_to_";
-    kernel_name << string_from_format(output->info()->format());
-    kernel_name << "_bt709";
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name.str());
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name.str();
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
-
-    unsigned int num_elems_processed_per_iteration = 0;
-
-    switch(input->info()->format())
-    {
-        case Format::NV12:
-        case Format::NV21:
-        case Format::IYUV:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                case Format::RGBA8888:
-                    num_elems_processed_per_iteration = 4;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            break;
-    }
-    ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
-                                 string_from_format(input->info()->format()).c_str(),
-                                 string_from_format(output->info()->format()).c_str());
-
-    std::stringstream kernel_name;
-
-    kernel_name << string_from_format(input->info()->format());
-    kernel_name << "_to_";
-    kernel_name << string_from_format(output->info()->format());
-    kernel_name << "_bt709";
-
-    _multi_input = input;
-    _output      = output;
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name.str());
-
-    // Configure kernel window
-    const bool  has_two_planes = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
-    const float sub_sampling   = (has_two_planes || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
-
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    win.set_dimension_step(Window::DimY, 2);
-
-    AccessWindowHorizontal plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
-                                         sub_sampling, sub_sampling);
-    AccessWindowRectangle plane2_access(has_two_planes ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
-                                        sub_sampling, sub_sampling);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              plane0_access, plane1_access, plane2_access,
-                              output_access);
-
-    ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
-                                                           input->plane(2)->info()->valid_region());
-    output_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name.str();
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->plane(0)->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(1));
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->plane(1)->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(1));
-}
-
-void CLColorConvertKernel::configure(const ICLImage *input, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
-
-    unsigned int num_elems_processed_per_iteration = 0;
-    unsigned int num_elems_read_per_iteration_x    = 0;
-
-    bool  has_two_planes = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
-    float sub_sampling   = (has_two_planes || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
-
-    switch(input->info()->format())
-    {
-        case Format::RGB888:
-        case Format::RGBA8888:
-        {
-            switch(output->info()->format())
-            {
-                case Format::NV12:
-                case Format::IYUV:
-                    num_elems_processed_per_iteration = 2;
-                    num_elems_read_per_iteration_x    = 8;
-                    break;
-                case Format::YUV444:
-                    num_elems_processed_per_iteration = 4;
-                    num_elems_read_per_iteration_x    = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::UYVY422:
-        case Format::YUYV422:
-        {
-            switch(output->info()->format())
-            {
-                case Format::NV12:
-                case Format::IYUV:
-                    num_elems_processed_per_iteration = 8;
-                    num_elems_read_per_iteration_x    = 8;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            break;
-    }
-
-    ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
-                                 string_from_format(input->info()->format()).c_str(),
-                                 string_from_format(output->info()->format()).c_str());
-
-    std::stringstream kernel_name;
-
-    kernel_name << string_from_format(input->info()->format());
-    kernel_name << "_to_";
-    kernel_name << string_from_format(output->info()->format());
-    kernel_name << "_bt709";
-    _input        = input;
-    _multi_output = output;
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name.str());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444))
-    {
-        win.set_dimension_step(Window::DimY, 2);
-    }
-
-    AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
-    AccessWindowRectangle  output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0,
-                                                num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
-
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_read_per_iteration_x);
-
-    update_window_and_padding(win,
-                              input_access,
-                              output_plane0_access,
-                              output_plane1_access,
-                              output_plane2_access);
-
-    ValidRegion input_region = input->info()->valid_region();
-
-    output_plane0_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(0)->info()->tensor_shape()));
-    output_plane1_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(1)->info()->tensor_shape()));
-    output_plane2_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(2)->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name.str();
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output)
-{
-    unsigned int num_elems_processed_per_iteration = 0;
-    switch(input->info()->format())
-    {
-        case Format::NV12:
-        case Format::NV21:
-        {
-            switch(output->info()->format())
-            {
-                case Format::IYUV:
-                case Format::YUV444:
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::IYUV:
-        {
-            switch(output->info()->format())
-            {
-                case Format::YUV444:
-                case Format::NV12:
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            break;
-    }
-    ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
-                                 string_from_format(input->info()->format()).c_str(),
-                                 string_from_format(output->info()->format()).c_str());
-
-    std::stringstream kernel_name;
-
-    kernel_name << string_from_format(input->info()->format());
-    kernel_name << "_to_";
-    kernel_name << string_from_format(output->info()->format());
-    kernel_name << "_bt709";
-
-    _multi_input  = input;
-    _multi_output = output;
-
-    // Create kernel
-    bool has_two_input_planars  = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
-    bool has_two_output_planars = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
-
-    float sub_sampling_input  = (has_two_input_planars || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
-    float sub_sampling_output = (has_two_output_planars || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
-
-    _kernel = create_kernel(compile_context, kernel_name.str());
-
-    Window win = calculate_max_window(*input->cl_plane(0)->info(), Steps(num_elems_processed_per_iteration));
-    win.set_dimension_step(Window::DimY, 2);
-
-    AccessWindowHorizontal input_plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  input_plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
-                                               sub_sampling_input, sub_sampling_input);
-    AccessWindowRectangle input_plane2_access(has_two_input_planars ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
-                                              sub_sampling_input, sub_sampling_input);
-    AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
-    AccessWindowRectangle  output_plane2_access(has_two_output_planars ? nullptr : output->plane(2)->info(), 0, 0,
-                                                num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
-
-    update_window_and_padding(win,
-                              input_plane0_access, input_plane1_access, input_plane2_access,
-                              output_plane0_access, output_plane1_access, output_plane2_access);
-
-    ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
-                                                           input->plane(2)->info()->valid_region());
-    output_plane0_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(0)->info()->tensor_shape()));
-    output_plane1_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(1)->info()->tensor_shape()));
-    output_plane2_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(2)->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name.str();
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->plane(0)->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(1));
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->plane(1)->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(1));
-}
-
-void CLColorConvertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    if(nullptr != _input && nullptr != _output)
-    {
-        do
-        {
-            unsigned int idx = 0;
-            add_2D_tensor_argument(idx, _input, slice);
-            add_2D_tensor_argument(idx, _output, slice);
-            enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window.slide_window_slice_2D(slice));
-    }
-    else if(nullptr != _input && nullptr != _multi_output)
-    {
-        Format format = _multi_output->info()->format();
-        do
-        {
-            Window win_uv(slice);
-
-            if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format))
-            {
-                win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-                win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-            }
-            unsigned int idx = 0;
-            add_2D_tensor_argument(idx, _input, slice);
-            add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice);
-            for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i)
-            {
-                add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_uv);
-            }
-            enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window.slide_window_slice_2D(slice));
-    }
-    else if(nullptr != _multi_input && nullptr != _output)
-    {
-        Format format = _multi_input->info()->format();
-        do
-        {
-            Window win_uv(slice);
-
-            if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format))
-            {
-                win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-                win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-            }
-
-            unsigned int idx = 0;
-            add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice);
-
-            for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i)
-            {
-                add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_uv);
-            }
-            add_2D_tensor_argument(idx, _output, slice);
-            enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window.slide_window_slice_2D(slice));
-    }
-    else if(nullptr != _multi_input && nullptr != _multi_output)
-    {
-        Format in_format  = _multi_input->info()->format();
-        Format out_format = _multi_output->info()->format();
-        do
-        {
-            Window win_in_uv(slice);
-            if((Format::NV12 == in_format) || (Format::NV21 == in_format) || (Format::IYUV == in_format))
-            {
-                win_in_uv.set(Window::DimX, Window::Dimension(win_in_uv.x().start() / 2,
-                                                              win_in_uv.x().end() / 2, win_in_uv.x().step() / 2));
-                win_in_uv.set(Window::DimY, Window::Dimension(win_in_uv.y().start() / 2, win_in_uv.y().end() / 2, 1));
-            }
-            unsigned int idx = 0;
-            add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice);
-            for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i)
-            {
-                add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_in_uv);
-            }
-
-            Window win_out_uv(slice);
-            if((Format::NV12 == out_format) || (Format::NV21 == out_format) || (Format::IYUV == out_format))
-            {
-                win_out_uv.set(Window::DimX, Window::Dimension(win_out_uv.x().start() / 2,
-                                                               win_out_uv.x().end() / 2, win_out_uv.x().step() / 2));
-                win_out_uv.set(Window::DimY, Window::Dimension(win_out_uv.y().start() / 2, win_out_uv.y().end() / 2, 1));
-            }
-
-            add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice);
-            for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i)
-            {
-                add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_out_uv);
-            }
-            enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window.slide_window_slice_2D(slice));
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Not supported");
-    }
-}

diff --git a/src/core/CL/kernels/CLColorConvertKernel.h b/src/core/CL/kernels/CLColorConvertKernel.h
deleted file mode 100644
index 0f08291..0000000
--- a/src/core/CL/kernels/CLColorConvertKernel.h
+++ /dev/null

@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOLORCONVERTKERNEL_H
-#define ARM_COMPUTE_CLCOLORCONVERTKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the color convert kernel.
- *
- */
-class CLColorConvertKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLColorConvertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLColorConvertKernel(const CLColorConvertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLColorConvertKernel &operator=(const CLColorConvertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLColorConvertKernel(CLColorConvertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLColorConvertKernel &operator=(CLColorConvertKernel &&) = default;
-    /** Default destructor. */
-    ~CLColorConvertKernel() = default;
-
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
-     * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
-     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
-     *                                                          U8 (if the formats of @p input is RGB888)
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
-     * @param[out] output          Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
-     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
-     *                                                          U8 (if the formats of @p input is RGB888)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
-     */
-    void configure(const ICLMultiImage *input, ICLImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output          Single-planar destination image. Formats supported: RGB888/RGBA8888
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
-     */
-    void configure(const ICLImage *input, ICLMultiImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     * @param[out] output          Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of  @p input is IYUV)
-     */
-    void configure(const ICLMultiImage *input, ICLMultiImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output          Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of  @p input is IYUV)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor     *_input;        /*pointer to single planar tensor input */
-    ICLTensor           *_output;       /*pointer to single planar tensor output */
-    const ICLMultiImage *_multi_input;  /*pointer to multi-planar input */
-    ICLMultiImage       *_multi_output; /*pointer to multi-planar output */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCOLORCONVERTKERNEL_H */

diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
deleted file mode 100644
index 21f1047..0000000
--- a/src/core/CL/kernels/CLConvolutionKernel.cpp
+++ /dev/null

@@ -1,392 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLConvolutionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <sstream>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int max_matrix_size = 81;
-} // namespace
-
-/****************************************************************************************\
- *                                 Square Convolution                                *
-\****************************************************************************************/
-
-template <unsigned int matrix_size>
-BorderSize             CLConvolutionKernel<matrix_size>::border_size() const
-{
-    return BorderSize(matrix_size / 2);
-}
-
-template <unsigned int matrix_size>
-void CLConvolutionKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_undefined);
-}
-
-template <unsigned int matrix_size>
-void CLConvolutionKernel<matrix_size>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(conv == nullptr);
-
-    _input  = input;
-    _output = output;
-
-    std::stringstream kernel_name;
-    CLBuildOptions    build_opts;
-    kernel_name << "convolution" << matrix_size << "x" << matrix_size << "_static";
-
-    if(scale == 0)
-    {
-        scale = calculate_matrix_scale(conv, matrix_size);
-    }
-
-    for(unsigned int i = 0; i < matrix_size * matrix_size; i++)
-    {
-        std::stringstream mat_str;
-        mat_str << "-DMAT" << i << "=" << conv[i];
-        build_opts.add_option(mat_str.str());
-    }
-
-    build_opts.add_option("-DSCALE=" + support::cpp11::to_string(scale));
-
-    DataType data_type = data_type_for_convolution_matrix(conv, matrix_size * matrix_size);
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-
-    std::stringstream out_type;
-    out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
-    build_opts.add_option(out_type.str());
-
-    _kernel = create_kernel(compile_context, kernel_name.str(), build_opts.options());
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = matrix_size;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
-
-/****************************************************************************************\
- *                                 Separable Convolution                                *
-\****************************************************************************************/
-template <unsigned int matrix_size>
-CLSeparableConvolutionHorKernel<matrix_size>::CLSeparableConvolutionHorKernel()
-    : _border_size(0)
-{
-}
-
-template <unsigned int matrix_size>
-BorderSize             CLSeparableConvolutionHorKernel<matrix_size>::border_size() const
-{
-    return _border_size;
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionHorKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, border_undefined);
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionHorKernel<matrix_size>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
-
-    ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
-
-    _input       = input;
-    _output      = output;
-    _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    std::array<int16_t, matrix_size *matrix_size> mat = { 0 };
-    memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));
-
-    for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
-    {
-        build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
-    }
-
-    build_opts.insert("-DSCALE=0");
-
-    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-    // Create kernel
-    const std::string kernel_name = "convolution_separable1x" + support::cpp11::to_string(matrix_size) + "_static";
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-template <unsigned int matrix_size>
-BorderSize             CLSeparableConvolutionVertKernel<matrix_size>::border_size() const
-{
-    return BorderSize{ matrix_size / 2, 0 };
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionVertKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output,
-                                                              const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_undefined, data_type);
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionVertKernel<matrix_size>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
-                                                              const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
-    ARM_COMPUTE_ERROR_ON(scale == 0);
-
-    _input  = input;
-    _output = output;
-
-    std::set<std::string> build_opts;
-
-    std::array<int16_t, matrix_size *matrix_size> mat = { 0 };
-    memcpy(mat.data() + matrix_size, conv, matrix_size * sizeof(int16_t));
-
-    for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
-    {
-        build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
-    }
-
-    build_opts.insert("-DSCALE=" + support::cpp11::to_string(scale));
-
-    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-
-    build_opts.insert("-DCOMPUTE_TYPE=" + get_cl_type_from_data_type(data_type));
-
-    std::stringstream out_type;
-    out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
-    build_opts.insert(out_type.str());
-
-    // Create kernel
-    const std::string kernel_name = "convolution_separable" + support::cpp11::to_string(matrix_size) + "x1_static";
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = matrix_size;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(data_type));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-/****************************************************************************************\
- *                                 Rectangle Convolution                                *
-\****************************************************************************************/
-
-CLConvolutionRectangleKernel::CLConvolutionRectangleKernel()
-    : _border_size(0), _input(nullptr), _output(nullptr)
-{
-}
-
-BorderSize CLConvolutionRectangleKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLConvolutionRectangleKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, width, height, scale, border_undefined);
-}
-
-void CLConvolutionRectangleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale,
-                                             bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(nullptr == conv);
-    ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width);
-    ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height);
-    ARM_COMPUTE_ERROR_ON(0 == scale);
-
-    _input       = input;
-    _output      = output;
-    _border_size = BorderSize(height / 2, width / 2);
-
-    std::set<std::string> options;
-
-    std::stringstream output_type;
-    output_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
-    options.insert(output_type.str());
-
-    uint32_t matrix_size = width * height;
-
-    std::array<int16_t, max_matrix_size> mat = { 0 };
-
-    memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));
-
-    for(unsigned int j = 0; j < max_matrix_size; j++)
-    {
-        options.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
-    }
-
-    options.insert("-DSCALE=" + support::cpp11::to_string(scale));
-
-    DataType data_type = data_type_for_convolution_matrix(conv, matrix_size);
-    options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-
-    options.insert("-DMATRIX_WIDTH=" + support::cpp11::to_string(width));
-    options.insert("-DMATRIX_HEIGHT=" + support::cpp11::to_string(height));
-
-    _kernel = create_kernel(compile_context, "convolution_rectangle", options);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    const unsigned int     num_rows_read_per_iteration       = height;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLConvolutionRectangleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-template class arm_compute::CLConvolutionKernel<3>;
-template class arm_compute::CLConvolutionKernel<5>;
-template class arm_compute::CLConvolutionKernel<7>;
-template class arm_compute::CLConvolutionKernel<9>;
-template class arm_compute::CLSeparableConvolutionVertKernel<5>;
-template class arm_compute::CLSeparableConvolutionVertKernel<7>;
-template class arm_compute::CLSeparableConvolutionVertKernel<9>;
-template class arm_compute::CLSeparableConvolutionHorKernel<5>;
-template class arm_compute::CLSeparableConvolutionHorKernel<7>;
-template class arm_compute::CLSeparableConvolutionHorKernel<9>;
-} // namespace arm_compute

diff --git a/src/core/CL/kernels/CLConvolutionKernel.h b/src/core/CL/kernels/CLConvolutionKernel.h
deleted file mode 100644
index 33e73ca..0000000
--- a/src/core/CL/kernels/CLConvolutionKernel.h
+++ /dev/null

@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONVOLUTIONKERNEL_H
-#define ARM_COMPUTE_CLCONVOLUTIONKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/****************************************************************************************\
- *                                    Square Convolution                                *
-\****************************************************************************************/
-
-/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
- * The client can supply a convolution matrix \f$ C_{m,n} \f$.
- * @f{eqnarray}{
- *  k_0 &=& \frac{m}{2}  \\
- *  l_0 &=& \frac{n}{2}  \\
- *  sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
- *  @f}
- *
- * @note The above equation for this function is similar to the default OpenCV Filter2D function,
- *       which actually computes a correlation and not a convolution.
- *       In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
- */
-template <unsigned int matrix_size>
-class CLConvolutionKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-
-/** Interface for the kernel which applies a 3x3 convolution to a tensor. */
-using CLConvolution3x3Kernel = CLConvolutionKernel<3>;
-/** Interface for the kernel which applies a 5x5 convolution to a tensor. */
-using CLConvolution5x5Kernel = CLConvolutionKernel<5>;
-/** Interface for the kernel which applies a 7x7 convolution to a tensor. */
-using CLConvolution7x7Kernel = CLConvolutionKernel<7>;
-/** Interface for the kernel which applies a 9x9 convolution to a tensor. */
-using CLConvolution9x9Kernel = CLConvolutionKernel<9>;
-
-/****************************************************************************************\
- *                              Separable Square Convolution                            *
-\****************************************************************************************/
-
-/** Kernel for the Horizontal pass of a Separable Convolution. Currently support 5x5, 7x7, 9x9 */
-template <unsigned int matrix_size>
-class CLSeparableConvolutionHorKernel : public ICLSimple2DKernel
-{
-public:
-    /** Default Constructor */
-    CLSeparableConvolutionHorKernel();
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U16/S16/S32.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-
-private:
-    BorderSize _border_size; /**< Border size */
-};
-
-/** Interface for the kernel which applies a horizontal pass of 5x5 convolution to a tensor. */
-using CLSeparableConvolution5x5HorKernel = CLSeparableConvolutionHorKernel<5>;
-/** Interface for the kernel which applies a horizontal pass of 7x7 convolution to a tensor. */
-using CLSeparableConvolution7x7HorKernel = CLSeparableConvolutionHorKernel<7>;
-/** Interface for the kernel which applies a horizontal pass of 9x9 convolution to a tensor. */
-using CLSeparableConvolution9x9HorKernel = CLSeparableConvolutionHorKernel<9>;
-
-/** Kernel for the Vertical pass of a Separable Convolution. Currently supports 5x5, 7x7, 9x9 */
-template <unsigned int matrix_size>
-class CLSeparableConvolutionVertKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U16/S16/S32.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in]  data_type        Data type to use for intermeidate result. @sa data_type_for_convolution
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U16/S16/S32.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in]  data_type        Data type to use for intermeidate result. @sa data_type_for_convolution
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-
-/** Interface for the kernel which applies a vertical pass of 5x5 convolution to a tensor. */
-using CLSeparableConvolution5x5VertKernel = CLSeparableConvolutionVertKernel<5>;
-/** Interface for the kernel which applies a vertical pass of 7x7 convolution to a tensor. */
-using CLSeparableConvolution7x7VertKernel = CLSeparableConvolutionVertKernel<7>;
-/** Interface for the kernel which applies a vertical pass of 9x9 convolution to a tensor. */
-using CLSeparableConvolution9x9VertKernel = CLSeparableConvolutionVertKernel<9>;
-
-/****************************************************************************************\
- *                                 Rectangle Convolution                                *
-\****************************************************************************************/
-
-/** Kernel for the running convolution on a rectangle matrix.
- *
- * @note Supports combinations of 3,5,7 and 9.
- */
-class CLConvolutionRectangleKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLConvolutionRectangleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvolutionRectangleKernel(const CLConvolutionRectangleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvolutionRectangleKernel &operator=(const CLConvolutionRectangleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLConvolutionRectangleKernel(CLConvolutionRectangleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLConvolutionRectangleKernel &operator=(CLConvolutionRectangleKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  width            Width of convolution matrix (Number of columns)
-     * @param[in]  height           Height of convolution matrix (Number of rows)
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  width            Width of convolution matrix (Number of columns)
-     * @param[in]  height           Height of convolution matrix (Number of rows)
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    BorderSize       _border_size;
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCONVOLUTIONKERNEL_H */

diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
deleted file mode 100644
index 5ff1136..0000000
--- a/src/core/CL/kernels/CLDerivativeKernel.cpp
+++ /dev/null

@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDerivativeKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLDerivativeKernel::CLDerivativeKernel()
-    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_derivative_x(false), _run_derivative_y(false)
-{
-}
-
-BorderSize CLDerivativeKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLDerivativeKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLDerivativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_derivative_x = output_x != nullptr;
-    _run_derivative_y = output_y != nullptr;
-
-    if(_run_derivative_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_derivative_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input    = input;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_derivative_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_derivative_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("derivative");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    constexpr unsigned int num_read_rows_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), 0, 0, 0, 0);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
-    if(_run_derivative_x && _run_derivative_y)
-    {
-        // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2'
-        input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration + 2, num_read_rows_per_iteration);
-    }
-    else if(_run_derivative_x)
-    {
-        // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2'
-        input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration + 2);
-    }
-    else if(_run_derivative_y)
-    {
-        input_access = AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration);
-    }
-
-    update_window_and_padding(win,
-                              input_access,
-                              output_x_access,
-                              output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLDerivativeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument_if((_run_derivative_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_derivative_y), idx, _output_y, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLDerivativeKernel.h b/src/core/CL/kernels/CLDerivativeKernel.h
deleted file mode 100644
index 14dd05d..0000000
--- a/src/core/CL/kernels/CLDerivativeKernel.h
+++ /dev/null

@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDERIVATIVEKERNEL_H
-#define ARM_COMPUTE_CLDERIVATIVEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the derivative kernel. */
-class CLDerivativeKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLDerivativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDerivativeKernel(const CLDerivativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDerivativeKernel &operator=(const CLDerivativeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLDerivativeKernel(CLDerivativeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLDerivativeKernel &operator=(CLDerivativeKernel &&) = default;
-    /** Default destructor */
-    ~CLDerivativeKernel() = default;
-    /** Initialise the kernel's sources, destination and border
-     *
-     * @note At least one of output_x or output_y must be set
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's sources, destination and border
-     *
-     * @note At least one of output_x or output_y must be set
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;            /**< Input tensor */
-    ICLTensor       *_output_x;         /**< Output tensor - Derivate along the X direction */
-    ICLTensor       *_output_y;         /**< Output tensor - Derivate along the Y direction */
-    bool             _run_derivative_x; /**< Do we need to run Derivative X ? */
-    bool             _run_derivative_y; /**< Do we need to run Derivative Y ? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDERIVATIVEKERNEL_H */

diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp
deleted file mode 100644
index cac5bc1..0000000
--- a/src/core/CL/kernels/CLDilateKernel.cpp
+++ /dev/null

@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDilateKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-using namespace arm_compute;
-
-BorderSize CLDilateKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLDilateKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLDilateKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "dilate");
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}

diff --git a/src/core/CL/kernels/CLDilateKernel.h b/src/core/CL/kernels/CLDilateKernel.h
deleted file mode 100644
index 591ec8c..0000000
--- a/src/core/CL/kernels/CLDilateKernel.h
+++ /dev/null

@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDILATEKERNEL_H
-#define ARM_COMPUTE_CLDILATEKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the dilate kernel.
- *
- */
-class CLDilateKernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDILATEKERNEL_H */

diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp
deleted file mode 100644
index f6d98a5..0000000
--- a/src/core/CL/kernels/CLErodeKernel.cpp
+++ /dev/null

@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLErodeKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-using namespace arm_compute;
-
-BorderSize CLErodeKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLErodeKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLErodeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "erode");
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_rows_read_pes_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_pes_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}

diff --git a/src/core/CL/kernels/CLErodeKernel.h b/src/core/CL/kernels/CLErodeKernel.h
deleted file mode 100644
index 4da97ae..0000000
--- a/src/core/CL/kernels/CLErodeKernel.h
+++ /dev/null

@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLERODEKERNEL_H
-#define ARM_COMPUTE_CLERODEKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the erode kernel.
- *
- */
-class CLErodeKernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLERODEKERNEL_H */

diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp
deleted file mode 100644
index 7481fd1..0000000
--- a/src/core/CL/kernels/CLFastCornersKernel.cpp
+++ /dev/null

@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLFastCornersKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLFastCornersKernel::CLFastCornersKernel()
-    : ICLKernel(), _input(nullptr), _output(nullptr)
-{
-}
-
-BorderSize CLFastCornersKernel::border_size() const
-{
-    return BorderSize(3);
-}
-
-void CLFastCornersKernel::configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, non_max_suppression, border_mode);
-}
-
-void CLFastCornersKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MSG(border_mode != BorderMode::UNDEFINED, "Not implemented");
-
-    _input  = input;
-    _output = output;
-
-    // Create build options
-    std::set<std::string> build_opts;
-
-    if(non_max_suppression)
-    {
-        build_opts.emplace("-DUSE_MAXSUPPRESSION");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("fast_corners");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
-    _kernel.setArg<cl_float>(idx, static_cast<float>(threshold));
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    constexpr unsigned int num_elems_read_per_iteration      = 7;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_mode == BorderMode::UNDEFINED, BorderSize(3));
-
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_mode == BorderMode::UNDEFINED, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(non_max_suppression);
-    _config_id += "_";
-    _config_id += lower_string(string_from_border_mode(border_mode));
-}
-
-void CLFastCornersKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLCopyToArrayKernel::CLCopyToArrayKernel()
-    : ICLKernel(), _input(nullptr), _corners(nullptr), _num_buffer(nullptr)
-{
-}
-
-void CLCopyToArrayKernel::configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, update_number, corners, num_buffers);
-}
-
-void CLCopyToArrayKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(corners == nullptr);
-    ARM_COMPUTE_ERROR_ON(num_buffers == nullptr);
-
-    _input      = input;
-    _corners    = corners;
-    _num_buffer = num_buffers;
-
-    std::set<std::string> build_opts;
-
-    if(update_number)
-    {
-        build_opts.emplace("-DUPDATE_NUMBER");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("copy_to_keypoint");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    //Get how many pixels skipped in the x dimension in the previous stages
-    unsigned int offset = _input->info()->valid_region().anchor.x();
-
-    // Set static kernel arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input and output parameters
-    _kernel.setArg<unsigned int>(idx++, _corners->max_num_values());
-    _kernel.setArg<cl_uint>(idx++, offset);
-    _kernel.setArg(idx++, *_num_buffer);
-    _kernel.setArg(idx++, _corners->cl_buffer());
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    Window                 win                               = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLCopyToArrayKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    //Initialise the _num_buffer as it used as both input and output
-    static const unsigned int zero_init = 0;
-    queue.enqueueWriteBuffer(*_num_buffer, CL_FALSE, 0, sizeof(unsigned int), &zero_init);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLFastCornersKernel.h b/src/core/CL/kernels/CLFastCornersKernel.h
deleted file mode 100644
index 0c1b564..0000000
--- a/src/core/CL/kernels/CLFastCornersKernel.h
+++ /dev/null

@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFASTCORNERSKERNEL_H
-#define ARM_COMPUTE_CLFASTCORNERSKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-#include <cstdint>
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** CL kernel to perform fast corners */
-class CLFastCornersKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLFastCornersKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFastCornersKernel(const CLFastCornersKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFastCornersKernel &operator=(const CLFastCornersKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLFastCornersKernel(CLFastCornersKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLFastCornersKernel &operator=(CLFastCornersKernel &&) = default;
-    /** Default destructor */
-    ~CLFastCornersKernel() = default;
-
-    /** Initialise the kernel.
-     *
-     * @param[in]  input               Source image. Data types supported: U8.
-     * @param[out] output              Output image. Data types supported: U8.
-     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
-     * @param[in]  non_max_suppression True if non-maxima suppresion is applied, false otherwise.
-     * @param[in]  border_mode         Strategy to use for borders.
-     */
-    void configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
-    /** Initialise the kernel.
-     *
-     * @param[in]  compile_context     The compile context to be used.
-     * @param[in]  input               Source image. Data types supported: U8.
-     * @param[out] output              Output image. Data types supported: U8.
-     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
-     * @param[in]  non_max_suppression True if non-maxima suppresion is applied, false otherwise.
-     * @param[in]  border_mode         Strategy to use for borders.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
-
-    // Inherited methods overridden
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLImage *_input;
-    ICLImage       *_output;
-};
-
-/** CL kernel to copy keypoints information to ICLKeyPointArray and counts the number of key points */
-class CLCopyToArrayKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLCopyToArrayKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCopyToArrayKernel(const CLCopyToArrayKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCopyToArrayKernel &operator=(const CLCopyToArrayKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLCopyToArrayKernel(CLCopyToArrayKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLCopyToArrayKernel &operator=(CLCopyToArrayKernel &&) = default;
-    /** Default destructor */
-    ~CLCopyToArrayKernel() = default;
-
-    /** Initialise the kernel.
-     *
-     * @param[in]  input         Source image. Data types supported: U8.
-     * @param[in]  update_number Flag to indicate whether we need to update the number of corners
-     * @param[out] corners       Array of keypoints to store the results.
-     * @param[out] num_buffers   Number of keypoints to store the results.
-     */
-    void configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
-    /** Initialise the kernel.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source image. Data types supported: U8.
-     * @param[in]  update_number   Flag to indicate whether we need to update the number of corners
-     * @param[out] corners         Array of keypoints to store the results.
-     * @param[out] num_buffers     Number of keypoints to store the results.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage   *_input;      /**< source image */
-    ICLKeyPointArray *_corners;    /**< destination array */
-    cl::Buffer       *_num_buffer; /**< CL memory to record number of key points in the array */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLFASTCORNERSKERNEL_H */

diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
deleted file mode 100644
index 40e9658..0000000
--- a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
+++ /dev/null

@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGaussian3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-BorderSize CLGaussian3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLGaussian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLGaussian3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    _input  = input;
-    _output = output;
-
-    // Set build options
-    std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=2", "-DMAT2=1",
-                                         "-DMAT3=2", "-DMAT4=4", "-DMAT5=2",
-                                         "-DMAT6=1", "-DMAT7=2", "-DMAT8=1",
-                                         "-DSCALE=16", "-DDATA_TYPE_OUT=uchar"
-                                       };
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "convolution3x3_static", build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}

diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.h b/src/core/CL/kernels/CLGaussian3x3Kernel.h
deleted file mode 100644
index 139b05d..0000000
--- a/src/core/CL/kernels/CLGaussian3x3Kernel.h
+++ /dev/null

@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H
-#define ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Gaussian 3x3 filter kernel.
- *
- */
-class CLGaussian3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H */

diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
deleted file mode 100644
index 46a7576..0000000
--- a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
+++ /dev/null

@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
-
-#include <cstdint>
-
-using namespace arm_compute;
-
-void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLGaussian5x5HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    const std::array<int16_t, 5> matrix = { 1, 4, 6, 4, 1 };
-
-    // Set arguments
-    CLSeparableConvolution5x5HorKernel::configure(compile_context, input, output, matrix.data(), border_undefined);
-}
-
-void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLGaussian5x5VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    const uint32_t scale = 256;
-    const std::array<int16_t, 5> matrix = { 1, 4, 6, 4, 1 };
-
-    // Set arguments
-    CLSeparableConvolution5x5VertKernel::configure(compile_context, input, output, matrix.data(), scale, border_undefined);
-}

diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.h b/src/core/CL/kernels/CLGaussian5x5Kernel.h
deleted file mode 100644
index 711710b..0000000
--- a/src/core/CL/kernels/CLGaussian5x5Kernel.h
+++ /dev/null

@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H
-#define ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H
-
-#include "src/core/CL/kernels/CLConvolutionKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 5x5 Gaussian filter on a tensor. */
-class CLGaussian5x5HorKernel : public CLSeparableConvolution5x5HorKernel
-{
-public:
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-private:
-    //Make the configure method of the parent class private
-    using CLSeparableConvolution5x5HorKernel::configure;
-};
-
-/** Interface for the kernel to run the vertical pass of 5x5 Gaussian filter on a tensor. */
-class CLGaussian5x5VertKernel : public CLSeparableConvolution5x5VertKernel
-{
-public:
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  input            Input tensor(output of horizontal pass). Data types supported: S16.
-     * @param[out] output           Destination tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Input tensor(output of horizontal pass). Data types supported: S16.
-     * @param[out] output           Destination tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-private:
-    //Make the configure method of the parent class private
-    using CLSeparableConvolution5x5VertKernel::configure;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H */

diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
deleted file mode 100644
index 065f7f7..0000000
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
+++ /dev/null

@@ -1,247 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGaussianPyramidKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel()
-    : _l2_load_offset(0)
-{
-}
-
-BorderSize CLGaussianPyramidHorKernel::border_size() const
-{
-    return BorderSize{ 0, 2 };
-}
-
-void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLGaussianPyramidHorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
-
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
-    }
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    const std::string kernel_name = std::string("gaussian1x5_sub_x");
-    _kernel                       = create_kernel(compile_context, kernel_name);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    constexpr unsigned int num_elems_read_per_iteration      = 20;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    const float            scale_x                           = static_cast<float>(output->info()->dimension(0)) / input->info()->dimension(0);
-
-    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
-
-    // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even
-    // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether
-    // a pixel is even or odd is determined based on the tensor shape not the
-    // valid region!)
-    // Thus the offset from which the first pixel (L2) for the convolution is
-    // loaded depends on the anchor and shape of the valid region.
-    // In the case of an even shape (= even image width) we need to load L2
-    // from -2 if the anchor is odd and from -1 if the anchor is even. That
-    // makes sure that L2 is always loaded from an odd pixel.
-    // On the other hand, for an odd shape (= odd image width) we need to load
-    // L2 from -1 if the anchor is odd and from -2 if the anchor is even to
-    // achieve the opposite effect.
-    // The condition can be simplified to checking whether anchor + shape is
-    // odd (-2) or even (-1) as only adding an odd and an even number will have
-    // an odd result.
-    _l2_load_offset = -border_size().left;
-
-    if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0)
-    {
-        _l2_load_offset += 1;
-    }
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
-                              output_access);
-
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLGaussianPyramidHorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window win_in(window);
-    win_in.shift(Window::DimX, _l2_load_offset);
-
-    //The output is half the width of the input:
-    Window win_out(window);
-    win_out.scale(Window::DimX, 0.5f);
-
-    Window slice_in  = win_in.first_slice_window_2D();
-    Window slice_out = win_out.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice_in);
-        add_2D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice_out, lws_hint());
-    }
-    while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
-}
-
-CLGaussianPyramidVertKernel::CLGaussianPyramidVertKernel()
-    : _t2_load_offset(0)
-{
-}
-
-BorderSize CLGaussianPyramidVertKernel::border_size() const
-{
-    return BorderSize{ 2, 0 };
-}
-
-void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLGaussianPyramidVertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
-
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
-    }
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    const std::string kernel_name = std::string("gaussian5x1_sub_y");
-    _kernel                       = create_kernel(compile_context, "gaussian5x1_sub_y");
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_rows_processed_per_iteration  = 2;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 8;
-    constexpr unsigned int num_rows_per_iteration            = 5;
-
-    const float scale_y = static_cast<float>(output->info()->dimension(1)) / input->info()->dimension(1);
-
-    Window                win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration));
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y);
-
-    // Determine whether we need to load even or odd rows. See above for a
-    // detailed explanation.
-    _t2_load_offset = -border_size().top;
-
-    if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0)
-    {
-        _t2_load_offset += 1;
-    }
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration),
-                              output_access);
-
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLGaussianPyramidVertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(window.x().step() != 8);
-    ARM_COMPUTE_ERROR_ON(window.y().step() % 2);
-
-    Window win_in(window);
-    win_in.shift(Window::DimY, _t2_load_offset);
-
-    Window win_out(window);
-    win_out.scale(Window::DimY, 0.5f);
-
-    Window slice_in  = win_in.first_slice_window_2D();
-    Window slice_out = win_out.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice_in);
-        add_2D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice_out, lws_hint());
-    }
-    while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
-}

diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.h b/src/core/CL/kernels/CLGaussianPyramidKernel.h
deleted file mode 100644
index a659544..0000000
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.h
+++ /dev/null

@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H
-#define ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H
-
-#include "src/core/CL/ICLSimpleKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a Gaussian filter and half scaling across width (horizontal pass) */
-class CLGaussianPyramidHorKernel : public ICLSimpleKernel
-{
-public:
-    /** Default constructor */
-    CLGaussianPyramidHorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidHorKernel(const CLGaussianPyramidHorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidHorKernel &operator=(const CLGaussianPyramidHorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidHorKernel(CLGaussianPyramidHorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidHorKernel &operator=(CLGaussianPyramidHorKernel &&) = default;
-    /** Default destructor */
-    ~CLGaussianPyramidHorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor. Output should have half the input width. Data types supported: U16.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor. Output should have half the input width. Data types supported: U16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    int _l2_load_offset;
-};
-
-/** OpenCL kernel to perform a Gaussian filter and half scaling across height (vertical pass) */
-class CLGaussianPyramidVertKernel : public ICLSimpleKernel
-{
-public:
-    /** Default constructor */
-    CLGaussianPyramidVertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidVertKernel(const CLGaussianPyramidVertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidVertKernel &operator=(const CLGaussianPyramidVertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidVertKernel(CLGaussianPyramidVertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidVertKernel &operator=(CLGaussianPyramidVertKernel &&) = default;
-    /** Default destructor */
-    ~CLGaussianPyramidVertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U16.
-     * @param[out] output Destination tensor. Output should have half the input height. Data types supported: U8.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U16.
-     * @param[out] output          Destination tensor. Output should have half the input height. Data types supported: U8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    int _t2_load_offset;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H */

diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
deleted file mode 100644
index cd3f1ee..0000000
--- a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
+++ /dev/null

@@ -1,237 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel()
-    : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size()
-{
-}
-
-void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input_magnitude, input_phase, output, hog_info);
-}
-
-void CLHOGOrientationBinningKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
-    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
-
-    _input_magnitude = input_magnitude;
-    _input_phase     = input_phase;
-    _output          = output;
-    _cell_size       = hog_info->cell_size();
-
-    float phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? hog_info->num_bins() / 360.0f : hog_info->num_bins() / 180.0f);
-    phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
-
-    std::stringstream args_str;
-    args_str << "-DCELL_WIDTH=" << hog_info->cell_size().width << " ";
-    args_str << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " ";
-    args_str << "-DNUM_BINS=" << hog_info->num_bins() << " ";
-    args_str << "-DPHASE_SCALE=" << phase_scale << " ";
-
-    // Construct kernel name
-    std::set<std::string> build_opts = {};
-    build_opts.insert(args_str.str());
-
-    // Create kernel
-    const std::string kernel_name = std::string("hog_orientation_binning");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    constexpr unsigned int num_elems_read_per_iteration      = 1;
-    const unsigned int     num_rows_read_per_iteration       = hog_info->cell_size().height;
-    constexpr unsigned int num_elems_written_per_iteration   = 1;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input_magnitude->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input_magnitude->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input_magnitude->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        // Compute slice for the magnitude and phase tensors
-        Window slice_mag_phase = window.first_slice_window_2D();
-        slice_mag_phase.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width));
-        slice_mag_phase.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height));
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input_magnitude, slice_mag_phase);
-        add_2D_tensor_argument(idx, _input_phase, slice_mag_phase);
-        add_2D_tensor_argument(idx, _output, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel()
-    : _input(nullptr), _output(nullptr), _num_cells_per_block_stride()
-{
-}
-
-void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, hog_info);
-}
-
-void CLHOGBlockNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
-{
-    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
-
-    // Number of cells per block
-    const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
-                                     hog_info->block_size().height / hog_info->cell_size().height);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * num_cells_per_block.area(), DataType::F32);
-
-    // Number of cells per block stride
-    const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
-                                            hog_info->block_stride().height / hog_info->cell_size().height);
-
-    _input                      = input;
-    _output                     = output;
-    _num_cells_per_block_stride = num_cells_per_block_stride;
-
-    std::stringstream args_str;
-    args_str << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " ";
-    args_str << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << num_cells_per_block.height << " ";
-    args_str << "-DNUM_BINS_PER_BLOCK_X=" << num_cells_per_block.width *hog_info->num_bins() << " ";
-    args_str << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " ";
-    args_str << "-DL2_NORM=" << static_cast<int>(HOGNormType::L2_NORM) << " ";
-    args_str << "-DL1_NORM=" << static_cast<int>(HOGNormType::L1_NORM) << " ";
-    args_str << "-DL2HYS_NORM=" << static_cast<int>(HOGNormType::L2HYS_NORM) << " ";
-    args_str << "-DHOG_NORM_TYPE=" << static_cast<int>(hog_info->normalization_type()) << " ";
-
-    // Construct kernel name
-    std::set<std::string> build_opts = {};
-    build_opts.insert(args_str.str());
-
-    const std::string kernel_name = std::string("hog_block_normalization");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    constexpr unsigned int num_elems_read_per_iteration      = 1;
-    const unsigned int     num_rows_read_per_iteration       = num_cells_per_block.height;
-    constexpr unsigned int num_elems_written_per_iteration   = 1;
-    const unsigned int     num_rows_written_per_iteration    = num_cells_per_block.height;
-
-    // Configure kernel window
-    Window                win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        // Compute slice for the magnitude and phase tensors
-        Window slice_in = window.first_slice_window_2D();
-        slice_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
-        slice_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice_in);
-        add_2D_tensor_argument(idx, _output, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.h b/src/core/CL/kernels/CLHOGDescriptorKernel.h
deleted file mode 100644
index eee2fa3..0000000
--- a/src/core/CL/kernels/CLHOGDescriptorKernel.h
+++ /dev/null

@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H
-#define ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H
-
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/Size2D.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** OpenCL kernel to perform HOG Orientation Binning */
-class CLHOGOrientationBinningKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHOGOrientationBinningKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGOrientationBinningKernel(const CLHOGOrientationBinningKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGOrientationBinningKernel &operator=(const CLHOGOrientationBinningKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHOGOrientationBinningKernel(CLHOGOrientationBinningKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHOGOrientationBinningKernel &operator=(CLHOGOrientationBinningKernel &&) = default;
-    /** Default destructor */
-    ~CLHOGOrientationBinningKernel() = default;
-
-    /**  Initialise the kernel's inputs, output and HOG's metadata
-     *
-     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
-     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
-     * @param[out] output          Output tensor which stores the local HOG for each cell. DataType supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[in]  hog_info        HOG's metadata
-     */
-    void configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
-    /**  Initialise the kernel's inputs, output and HOG's metadata
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
-     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
-     * @param[out] output          Output tensor which stores the local HOG for each cell. DataType supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[in]  hog_info        HOG's metadata
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input_magnitude;
-    const ICLTensor *_input_phase;
-    ICLTensor       *_output;
-    Size2D           _cell_size;
-};
-
-/** OpenCL kernel to perform HOG block normalization */
-class CLHOGBlockNormalizationKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHOGBlockNormalizationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGBlockNormalizationKernel(const CLHOGBlockNormalizationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGBlockNormalizationKernel &operator=(const CLHOGBlockNormalizationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHOGBlockNormalizationKernel(CLHOGBlockNormalizationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHOGBlockNormalizationKernel &operator=(CLHOGBlockNormalizationKernel &&) = default;
-    /** Default destructor */
-    ~CLHOGBlockNormalizationKernel() = default;
-
-    /** Initialise the kernel's input, output and HOG's metadata
-     *
-     * @param[in]  input    Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[out] output   Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog_info HOG's metadata
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
-    /** Initialise the kernel's input, output and HOG's metadata
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[out] output          Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog_info        HOG's metadata
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    Size2D           _num_cells_per_block_stride;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H */

diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
deleted file mode 100644
index 861155b..0000000
--- a/src/core/CL/kernels/CLHOGDetectorKernel.cpp
+++ /dev/null

@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-CLHOGDetectorKernel::CLHOGDetectorKernel()
-    : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr)
-{
-}
-
-void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride,
-                                    float threshold, uint16_t idx_class)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, num_detection_windows, detection_window_stride, threshold, idx_class);
-}
-
-void CLHOGDetectorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows,
-                                    const Size2D &detection_window_stride,
-                                    float threshold, uint16_t idx_class)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(hog == nullptr);
-    ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
-    ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr);
-    ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
-    ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
-
-    const Size2D &detection_window_size = hog->info()->detection_window_size();
-    const Size2D &block_size            = hog->info()->block_size();
-    const Size2D &block_stride          = hog->info()->block_stride();
-
-    _input                 = input;
-    _detection_windows     = detection_windows;
-    _num_detection_windows = num_detection_windows;
-
-    const unsigned int num_bins_per_descriptor_x   = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
-    const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
-
-    ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
-
-    std::stringstream args_str;
-    args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " ";
-    args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " ";
-    args_str << "-DTHRESHOLD=" << threshold << " ";
-    args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " ";
-    args_str << "-DIDX_CLASS=" << idx_class << " ";
-    args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " ";
-    args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " ";
-    args_str << "-DDETECTION_WINDOW_STRIDE_WIDTH=" << detection_window_stride.width << " ";
-    args_str << "-DDETECTION_WINDOW_STRIDE_HEIGHT=" << detection_window_stride.height << " ";
-
-    // Construct kernel name
-    std::set<std::string> build_opts = {};
-    build_opts.insert(args_str.str());
-
-    // Create kernel
-    const std::string kernel_name = std::string("hog_detector");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Set static kernel arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters
-    _kernel.setArg(idx++, hog->cl_buffer());
-    _kernel.setArg(idx++, detection_windows->cl_buffer());
-    _kernel.setArg(idx++, *_num_detection_windows);
-
-    // Get the number of blocks along the x and y directions of the input tensor
-    const ValidRegion &valid_region = input->info()->valid_region();
-    const size_t       num_blocks_x = valid_region.shape[0];
-    const size_t       num_blocks_y = valid_region.shape[1];
-
-    // Get the number of blocks along the x and y directions of the detection window
-    const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
-    const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
-
-    const size_t window_step_x = detection_window_stride.width / block_stride.width;
-    const size_t window_step_y = detection_window_stride.height / block_stride.height;
-
-    // Configure kernel window
-    Window win;
-    win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x) + window_step_x, window_step_x));
-    win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y) + window_step_y, window_step_y));
-
-    constexpr unsigned int num_elems_read_per_iteration = 1;
-    const unsigned int     num_rows_read_per_iteration  = num_blocks_per_descriptor_y;
-
-    update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.h b/src/core/CL/kernels/CLHOGDetectorKernel.h
deleted file mode 100644
index c28e6eb..0000000
--- a/src/core/CL/kernels/CLHOGDetectorKernel.h
+++ /dev/null

@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDETECTORKERNEL_H
-#define ARM_COMPUTE_CLHOGDETECTORKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform HOG detector kernel using linear SVM */
-class CLHOGDetectorKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHOGDetectorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGDetectorKernel(const CLHOGDetectorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGDetectorKernel &operator=(const CLHOGDetectorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHOGDetectorKernel(CLHOGDetectorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHOGDetectorKernel &operator=(CLHOGDetectorKernel &&) = default;
-    /** Default destructor */
-    ~CLHOGDetectorKernel() = default;
-
-    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
-     *
-     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog                     HOG data object used by @ref CLHOGOrientationBinningKernel and  @ref CLHOGBlockNormalizationKernel
-     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
-     * @param[in]  num_detection_windows   Number of detected objects
-     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
-     *                                     It must be multiple of the hog->info()->block_stride()
-     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
-     */
-    void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f,
-                   uint16_t idx_class = 0);
-    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
-     *
-     * @param[in]  compile_context         The compile context to be used.
-     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog                     HOG data object used by @ref CLHOGOrientationBinningKernel and  @ref CLHOGBlockNormalizationKernel
-     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
-     * @param[in]  num_detection_windows   Number of detected objects
-     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
-     *                                     It must be multiple of the hog->info()->block_stride()
-     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows,
-                   const Size2D &detection_window_stride, float threshold = 0.0f,
-                   uint16_t idx_class = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue);
-
-private:
-    const ICLTensor         *_input;
-    ICLDetectionWindowArray *_detection_windows;
-    cl::Buffer              *_num_detection_windows;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHOGDETECTORKERNEL_H */

diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
deleted file mode 100644
index cbc056f..0000000
--- a/src/core/CL/kernels/CLHarrisCornersKernel.cpp
+++ /dev/null

@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLHarrisCornersKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-CLHarrisScoreKernel::CLHarrisScoreKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(), _strength_thresh(), _norm_factor(), _border_size(0)
-{
-}
-
-BorderSize CLHarrisScoreKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLHarrisScoreKernel::configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
-                                    int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
-                                    bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, block_size, norm_factor, strength_thresh, sensitivity, border_undefined);
-}
-
-void CLHarrisScoreKernel::configure(const CLCompileContext &compile_context, const ICLImage *input1, const ICLImage *input2, ICLImage *output,
-                                    int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
-                                    bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-    ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
-    ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
-
-    _input1          = input1;
-    _input2          = input2;
-    _output          = output;
-    _sensitivity     = sensitivity;
-    _strength_thresh = strength_thresh;
-    _norm_factor     = norm_factor;
-    _border_size     = BorderSize(block_size / 2);
-
-    // Select kernel
-    std::stringstream harris_score_kernel_name;
-    harris_score_kernel_name << "harris_score_" << block_size << "x" << block_size;
-
-    // Create build options
-    std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())) };
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, harris_score_kernel_name.str(), build_opts);
-
-    // Set static kernel arguments
-    unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, sensitivity);
-    _kernel.setArg(idx++, strength_thresh);
-    _kernel.setArg(idx++, norm_factor);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-    constexpr unsigned int num_elems_written_per_iteration   = 4;
-    const unsigned int     num_elems_read_per_iteration      = block_size == 7 ? 10 : 8;
-    const unsigned int     num_rows_read_per_iteration       = block_size;
-
-    Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input1_access(input1->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowRectangle  input2_access(input2->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input1_access, input2_access, output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
-    output_access.set_valid_region(win, valid_region, border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = harris_score_kernel_name.str();
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input1->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input1->info()->dimension(1));
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input2->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input2->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input2->info()->dimension(1));
-}
-
-void CLHarrisScoreKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input1, slice);
-        add_2D_tensor_argument(idx, _input2, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.h b/src/core/CL/kernels/CLHarrisCornersKernel.h
deleted file mode 100644
index 6482b0a..0000000
--- a/src/core/CL/kernels/CLHarrisCornersKernel.h
+++ /dev/null

@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHARRISCORNERSKERNEL_H
-#define ARM_COMPUTE_CLHARRISCORNERSKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the harris score kernel.
- *
- * @note The implementation supports 3, 5, and 7 for the block_size.
- */
-class CLHarrisScoreKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHarrisScoreKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHarrisScoreKernel(const CLHarrisScoreKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHarrisScoreKernel &operator=(const CLHarrisScoreKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHarrisScoreKernel(CLHarrisScoreKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHarrisScoreKernel &operator=(CLHarrisScoreKernel &&) = default;
-    /** Default destructor */
-    ~CLHarrisScoreKernel() = default;
-
-    /** Setup the kernel parameters
-     *
-     * @param[in]  input1           Source image (gradient X). Data types supported S16, S32. (Must be the same as input2)
-     * @param[in]  input2           Source image (gradient Y). Data types supported S16, S32. (Must be the same as input1)
-     * @param[out] output           Destination image (harris score). Data types supported F32
-     * @param[in]  block_size       The block window size used to compute the Harris Corner score.  Supports: 3, 5 and 7
-     * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
-     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
-     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
-                   int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
-                   bool border_undefined);
-    /** Setup the kernel parameters
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input1           Source image (gradient X). Data types supported S16, S32. (Must be the same as input2)
-     * @param[in]  input2           Source image (gradient Y). Data types supported S16, S32. (Must be the same as input1)
-     * @param[out] output           Destination image (harris score). Data types supported F32
-     * @param[in]  block_size       The block window size used to compute the Harris Corner score.  Supports: 3, 5 and 7
-     * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
-     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
-     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input1, const ICLImage *input2, ICLImage *output,
-                   int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
-                   bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-protected:
-    const ICLImage *_input1;          /**< Source image - Gx component */
-    const ICLImage *_input2;          /**< Source image - Gy component */
-    ICLImage       *_output;          /**< Source image - Harris score */
-    float           _sensitivity;     /**< Sensitivity value */
-    float           _strength_thresh; /**< Threshold value */
-    float           _norm_factor;     /**< Normalization factor */
-    BorderSize      _border_size;     /**< Border size */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHARRISCORNERSKERNEL_H */

diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
deleted file mode 100644
index ca5322a..0000000
--- a/src/core/CL/kernels/CLHistogramKernel.cpp
+++ /dev/null

@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLHistogramKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLDistribution1D.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstring>
-#include <string>
-
-using namespace arm_compute;
-
-// each thread handle 16 pixels
-constexpr signed int pixels_per_item = 16;
-
-// local work group size in X dimension
-constexpr unsigned int local_x_size = 16;
-
-CLHistogramKernel::CLHistogramKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLHistogramKernel::configure(const ICLImage *input, ICLDistribution1D *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLHistogramKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    // Check input size
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    // Check offset
-    ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range.");
-
-    // Check range
-    ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range.");
-
-    _input  = input;
-    _output = output;
-
-    if(_input->info()->dimension(0) < pixels_per_item)
-    {
-        return;
-    }
-
-    unsigned int num_bins    = _output->num_bins();
-    unsigned int window_size = _output->window();
-    unsigned int offset      = _output->offset();
-    unsigned int range       = _output->range();
-    unsigned int offrange    = offset + range;
-    unsigned int bin_size    = _output->size();
-    unsigned int buffer_size = bin_size + 1; // We need one extra place for pixels that don't meet the conditions
-
-    // Create kernel
-    bool              is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
-    const std::string kernel_name   = is_fixed_size ? "hist_local_kernel_fixed" : "hist_local_kernel";
-    _kernel                         = create_kernel(compile_context, kernel_name);
-
-    // Set static kernel arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, buffer_size, nullptr);
-    _kernel.setArg(idx++, _output->cl_buffer());
-    if(!is_fixed_size)
-    {
-        _kernel.setArg<cl_uint>(idx++, num_bins);
-        _kernel.setArg<cl_uint>(idx++, offset);
-        _kernel.setArg<cl_uint>(idx++, range);
-        _kernel.setArg<cl_uint>(idx++, offrange);
-    }
-
-    // We only run histogram on Image, therefore only 2 dimensions here
-    unsigned int end_position = (_input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
-
-    // Configure kernel window
-    Window win;
-    win.set(0, Window::Dimension(0, end_position, pixels_per_item));
-    win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
-
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, pixels_per_item));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLHistogramKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    // TODO (COMPMID-679): Add CLMemFill
-    _output->map(queue, true);
-    ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
-    memset(_output->buffer(), 0, _output->size());
-    _output->unmap(queue);
-
-    if(_input->info()->dimension(0) < pixels_per_item)
-    {
-        return;
-    }
-
-    Window             slice = window.first_slice_window_2D();
-    const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-    cl::NDRange        lws   = (local_x_size < gws_x) ? cl::NDRange(local_x_size, 1) : cl::NDRange(1, 1);
-
-    do
-    {
-        /* Run the core part which has width can be divided by 16 */
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-
-        enqueue(queue, *this, slice, lws);
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLHistogramBorderKernel::CLHistogramBorderKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLHistogramBorderKernel::configure(const ICLImage *input, ICLDistribution1D *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLHistogramBorderKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    // Check input size
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    // Check offset
-    ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range.");
-
-    // Check range
-    ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range.");
-
-    // We only run histogram on Image, therefore only 2 dimensions here
-    unsigned int start_position = (input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
-
-    if(start_position >= input->info()->dimension(0))
-    {
-        return; // no need to run histogram border kernel
-    }
-
-    _input  = input;
-    _output = output;
-
-    unsigned int num_bins    = _output->num_bins();
-    unsigned int window_size = _output->window();
-    unsigned int offset      = _output->offset();
-    unsigned int range       = _output->range();
-    unsigned int offrange    = offset + range;
-
-    // Create kernel
-    bool              is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
-    const std::string kernel_name   = is_fixed_size ? "hist_border_kernel_fixed" : "hist_border_kernel";
-    _kernel                         = create_kernel(compile_context, kernel_name);
-
-    // Set static kernel arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, _output->cl_buffer());
-    if(!is_fixed_size)
-    {
-        _kernel.setArg<cl_uint>(idx++, num_bins);
-        _kernel.setArg<cl_uint>(idx++, offset);
-        _kernel.setArg<cl_uint>(idx++, range);
-        _kernel.setArg<cl_uint>(idx++, offrange);
-    }
-
-    // Configure kernel window
-    Window win;
-    win.set(0, Window::Dimension(start_position, _input->info()->dimension(0)));
-    win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, 1));
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLHistogramBorderKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    if(window.x().start() >= window.x().end())
-    {
-        return;
-    }
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    cl::NDRange lws = cl::NDRange(1, 1);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        /* Run the border part which has width cannot be divided by 16 */
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-
-        enqueue(queue, *this, slice, lws);
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLHistogramKernel.h b/src/core/CL/kernels/CLHistogramKernel.h
deleted file mode 100644
index 9c97c65..0000000
--- a/src/core/CL/kernels/CLHistogramKernel.h
+++ /dev/null

@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHISTOGRAMKERNEL_H
-#define ARM_COMPUTE_CLHISTOGRAMKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLDistribution1D;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface to run the histogram kernel. This kernel processes the part of image with width can be divided by 16.
- *  If the image width is not a multiple of 16, remaining pixels have to be processed with the @ref CLHistogramBorderKernel
- */
-class CLHistogramKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLHistogramKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramKernel(const CLHistogramKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramKernel &operator=(const CLHistogramKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHistogramKernel(CLHistogramKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHistogramKernel &operator=(CLHistogramKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input  Source image. Data types supported: U8.
-     * @param[out] output Destination distribution.
-     */
-    void configure(const ICLImage *input, ICLDistribution1D *output);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source image. Data types supported: U8.
-     * @param[out] output          Destination distribution.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage    *_input;
-    ICLDistribution1D *_output;
-};
-
-/** Interface to run the histogram kernel to handle the leftover part of image
- *
- */
-class CLHistogramBorderKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLHistogramBorderKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramBorderKernel(const CLHistogramBorderKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramBorderKernel &operator=(const CLHistogramBorderKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHistogramBorderKernel(CLHistogramBorderKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHistogramBorderKernel &operator=(CLHistogramBorderKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input  Source image. Data types supported: U8.
-     * @param[out] output Destination distribution.
-     */
-    void configure(const ICLImage *input, ICLDistribution1D *output);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source image. Data types supported: U8.
-     * @param[out] output          Destination distribution.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage    *_input;
-    ICLDistribution1D *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHISTOGRAMKERNEL_H*/

diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
deleted file mode 100644
index 5e5683d..0000000
--- a/src/core/CL/kernels/CLIntegralImageKernel.cpp
+++ /dev/null

@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLIntegralImageKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-
-using namespace arm_compute;
-
-void CLIntegralImageHorKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLIntegralImageHorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    const std::string kernel_name = std::string("integral_horizontal");
-    _kernel                       = create_kernel(compile_context, kernel_name);
-
-    // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
-    const unsigned int num_elems_accessed_per_iteration  = ceil_to_multiple(num_elems_processed_per_iteration, 16);
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_accessed_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_accessed_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-CLIntegralImageVertKernel::CLIntegralImageVertKernel()
-    : _in_out(nullptr)
-{
-}
-
-void CLIntegralImageVertKernel::configure(ICLTensor *in_out)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), in_out);
-}
-
-void CLIntegralImageVertKernel::configure(const CLCompileContext &compile_context, ICLTensor *in_out)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(in_out, 1, DataType::U32);
-
-    _in_out = in_out;
-
-    // Create kernel
-    const std::string kernel_name = std::string("integral_vertical");
-    _kernel                       = create_kernel(compile_context, kernel_name);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration_x = 8;
-    const unsigned int     num_elems_processed_per_iteration_y = in_out->info()->dimension(Window::DimY);
-
-    Window win = calculate_max_window(*in_out->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowRectangle in_out_access(in_out->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
-    update_window_and_padding(win, in_out_access);
-
-    in_out_access.set_valid_region(win, in_out->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(in_out->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(in_out->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(in_out->info()->dimension(1));
-}
-
-void CLIntegralImageVertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    const size_t height = _in_out->info()->dimension(1);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _in_out, slice);
-        _kernel.setArg<cl_uint>(idx++, height);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLIntegralImageKernel.h b/src/core/CL/kernels/CLIntegralImageKernel.h
deleted file mode 100644
index 0e40e3a..0000000
--- a/src/core/CL/kernels/CLIntegralImageKernel.h
+++ /dev/null

@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H
-#define ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to run the horizontal pass of the integral image kernel. */
-class CLIntegralImageHorKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  An input tensor. Data types supported: U8
-     * @param[out] output Destination tensor, Data types supported: U32.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           An input tensor. Data types supported: U8
-     * @param[out] output          Destination tensor, Data types supported: U32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-};
-
-/** Interface to run the vertical pass of the integral image kernel. */
-class CLIntegralImageVertKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLIntegralImageVertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLIntegralImageVertKernel(const CLIntegralImageVertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLIntegralImageVertKernel &operator=(const CLIntegralImageVertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLIntegralImageVertKernel(CLIntegralImageVertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLIntegralImageVertKernel &operator=(CLIntegralImageVertKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in,out] in_out The input/output tensor. Data types supported: U32
-     */
-    void configure(ICLTensor *in_out);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] in_out          The input/output tensor. Data types supported: U32
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *in_out);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor *_in_out;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H */

diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
deleted file mode 100644
index 9845dd6..0000000
--- a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
+++ /dev/null

@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLMagnitudePhaseKernel::CLMagnitudePhaseKernel()
-    : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr), _run_mag(false), _run_phase(false)
-{
-}
-
-void CLMagnitudePhaseKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
-                                       MagnitudeType mag_type, PhaseType phase_type)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), gx, gy, magnitude, phase, mag_type, phase_type);
-}
-
-void CLMagnitudePhaseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
-                                       MagnitudeType mag_type, PhaseType phase_type)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON((magnitude == nullptr) && (phase == nullptr));
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
-
-    _run_mag   = (magnitude != nullptr);
-    _run_phase = (phase != nullptr);
-    if(_run_mag)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16, DataType::S32);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, magnitude);
-    }
-    if(_run_phase)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
-    }
-
-    if(!_run_mag && !_run_phase)
-    {
-        ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
-    }
-
-    _gx        = gx;
-    _gy        = gy;
-    _magnitude = magnitude;
-    _phase     = phase;
-
-    // Construct kernel name
-    std::set<std::string> build_opts = {};
-
-    // Add magnitude type
-    if(_run_mag)
-    {
-        switch(mag_type)
-        {
-            case MagnitudeType::L1NORM:
-                build_opts.insert("-DMAGNITUDE=1");
-                break;
-            case MagnitudeType::L2NORM:
-                build_opts.insert("-DMAGNITUDE=2");
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Unsupported magnitude calculation type.");
-                build_opts.insert("-DMAGNITUDE=0");
-                break;
-        }
-    }
-
-    // Add phase type
-    if(_run_phase)
-    {
-        switch(phase_type)
-        {
-            case PhaseType::UNSIGNED:
-                build_opts.insert("-DPHASE=1");
-                break;
-            case PhaseType::SIGNED:
-                build_opts.insert("-DPHASE=2");
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Unsupported phase calculation type.");
-                build_opts.insert("-DPHASE=0");
-                break;
-        }
-    }
-
-    // Add data_type
-    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(gx->info()->data_type()));
-
-    // Create kernel
-    const std::string kernel_name = std::string("magnitude_phase");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal gx_access(gx->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal gy_access(gy->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              gx_access, gy_access,
-                              output_magnitude_access, output_phase_access);
-
-    ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
-                                                       gy->info()->valid_region());
-    output_magnitude_access.set_valid_region(win, valid_region);
-    output_phase_access.set_valid_region(win, valid_region);
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(gx->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gx->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gx->info()->dimension(1));
-}
-
-void CLMagnitudePhaseKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _gx, slice);
-        add_2D_tensor_argument(idx, _gy, slice);
-        add_2D_tensor_argument_if((_run_mag), idx, _magnitude, slice);
-        add_2D_tensor_argument_if((_run_phase), idx, _phase, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.h b/src/core/CL/kernels/CLMagnitudePhaseKernel.h
deleted file mode 100644
index 514036b..0000000
--- a/src/core/CL/kernels/CLMagnitudePhaseKernel.h
+++ /dev/null

@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H
-#define ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Template interface for the kernel to compute magnitude and phase.
- *
- */
-class CLMagnitudePhaseKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLMagnitudePhaseKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMagnitudePhaseKernel(const CLMagnitudePhaseKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMagnitudePhaseKernel &operator=(const CLMagnitudePhaseKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMagnitudePhaseKernel(CLMagnitudePhaseKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMagnitudePhaseKernel &operator=(CLMagnitudePhaseKernel &&) = default;
-    /** Initialise the kernel's input, output.
-     *
-     * @note At least one of output1 or output2 must be set.
-     *
-     * @param[in]  gx         The input gradient X tensor. Data types supported: S16/S32.
-     * @param[in]  gy         The input gradient Y tensor. Data types supported: S16/S32.
-     * @param[out] magnitude  (Optional) The output tensor - Magnitude. Data types supported: S16/S32.
-     * @param[out] phase      (Optional) The output tensor - Phase. Data types supported: U8.
-     * @param[in]  mag_type   (Optional) Magnitude calculation type. Default: L2NORM.
-     * @param[in]  phase_type (Optional) Phase calculation type. Default: SIGNED.
-     */
-    void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
-                   MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
-    /** Initialise the kernel's input, output.
-     *
-     * @note At least one of output1 or output2 must be set.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  gx              The input gradient X tensor. Data types supported: S16/S32.
-     * @param[in]  gy              The input gradient Y tensor. Data types supported: S16/S32.
-     * @param[out] magnitude       (Optional) The output tensor - Magnitude. Data types supported: S16/S32.
-     * @param[out] phase           (Optional) The output tensor - Phase. Data types supported: U8.
-     * @param[in]  mag_type        (Optional) Magnitude calculation type. Default: L2NORM.
-     * @param[in]  phase_type      (Optional) Phase calculation type. Default: SIGNED.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
-                   MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_gx;        /**< Input gradient X. */
-    const ICLTensor *_gy;        /**< Input gradient Y. */
-    ICLTensor       *_magnitude; /**< Output - Magnitude. */
-    ICLTensor       *_phase;     /**< Output - Phase. */
-    bool             _run_mag;   /**< Calculate magnitude ? */
-    bool             _run_phase; /**< Calculate phase ? */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H */

diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
deleted file mode 100644
index aed6e6e..0000000
--- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp
+++ /dev/null

@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMeanStdDevKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cmath>
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLMeanStdDevKernel::CLMeanStdDevKernel()
-    : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _border_size(0)
-{
-}
-
-BorderSize CLMeanStdDevKernel::border_size() const
-{
-    return _border_size;
-}
-
-Status CLMeanStdDevKernel::validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
-{
-    ARM_COMPUTE_UNUSED(mean);
-    ARM_COMPUTE_UNUSED(stddev);
-    ARM_COMPUTE_UNUSED(global_sum);
-    ARM_COMPUTE_UNUSED(global_sum_squared);
-    ARM_COMPUTE_RETURN_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED();
-    ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    return Status{};
-}
-
-void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, mean, global_sum, stddev, global_sum_squared);
-}
-
-void CLMeanStdDevKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, global_sum);
-    ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared);
-    ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevKernel::validate(input->info(), mean, global_sum, stddev, global_sum_squared));
-
-    _input              = input;
-    _mean               = mean;
-    _stddev             = stddev;
-    _global_sum         = global_sum;
-    _global_sum_squared = global_sum_squared;
-
-    // Create kernel
-    std::set<std::string> build_opts;
-
-    if(_stddev != nullptr)
-    {
-        build_opts.insert("-DSTDDEV");
-    }
-
-    _kernel = create_kernel(compile_context, "mean_stddev_accumulate", build_opts);
-
-    // Set fixed arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input parameters
-
-    _kernel.setArg(idx++, static_cast<cl_uint>(input->info()->dimension(1)));
-    _kernel.setArg(idx++, *_global_sum);
-
-    if(_stddev != nullptr)
-    {
-        _kernel.setArg(idx++, *_global_sum_squared);
-    }
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration_x = 8;
-    const unsigned int     num_elems_processed_per_iteration_y = input->info()->dimension(1);
-
-    _border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - input->info()->dimension(0));
-
-    Window                win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    update_window_and_padding(win, input_access);
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLMeanStdDevKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    // Clear sums
-    static const cl_ulong zero = 0;
-    queue.enqueueWriteBuffer(*_global_sum, CL_FALSE, 0, sizeof(cl_ulong), &zero);
-
-    if(_stddev != nullptr)
-    {
-        queue.enqueueWriteBuffer(*_global_sum_squared, CL_FALSE, 0, sizeof(cl_ulong), &zero);
-    }
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        // Set slice step equal to height to force gws[1] to 1,
-        // as each thread calculates the sum across all rows and columns equal to the number of elements processed by each work-item
-        slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-
-    // Calculate mean and stddev
-    cl_ulong    global_sum         = 0;
-    cl_ulong    global_sum_squared = 0;
-    const float num_pixels         = _input->info()->dimension(0) * _input->info()->dimension(1);
-
-    queue.enqueueReadBuffer(*_global_sum, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum));
-    const float mean = global_sum / num_pixels;
-    *_mean           = mean;
-
-    if(_stddev != nullptr)
-    {
-        queue.enqueueReadBuffer(*_global_sum_squared, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum_squared));
-        *_stddev = std::sqrt((global_sum_squared / num_pixels) - (mean * mean));
-    }
-}

diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.h b/src/core/CL/kernels/CLMeanStdDevKernel.h
deleted file mode 100644
index 179a202..0000000
--- a/src/core/CL/kernels/CLMeanStdDevKernel.h
+++ /dev/null

@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEANSTDDEVKERNEL_H
-#define ARM_COMPUTE_CLMEANSTDDEVKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
-class CLMeanStdDevKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLMeanStdDevKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMeanStdDevKernel(const CLMeanStdDevKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMeanStdDevKernel &operator=(const CLMeanStdDevKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMeanStdDevKernel(CLMeanStdDevKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMeanStdDevKernel &operator=(CLMeanStdDevKernel &&) = default;
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  input              Input image. Data types supported: U8.
-     * @param[out] mean               Input average pixel value.
-     * @param[out] global_sum         Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
-     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
-     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
-     */
-    void configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Input image. Data types supported: U8.
-     * @param[out] mean               Input average pixel value.
-     * @param[out] global_sum         Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
-     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
-     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevKernel.
-     *
-     * @param[in] input              Input image info. Data types supported: U8.
-     * @param[in] mean               Input average pixel value.
-     * @param[in] global_sum         Keeps global sum of pixel values.
-     * @param[in] stddev             (Optional) Output standard deviation of pixel values.
-     * @param[in] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-    BorderSize border_size() const override;
-
-private:
-    const ICLImage *_input;
-    float          *_mean;
-    float          *_stddev;
-    cl::Buffer     *_global_sum;
-    cl::Buffer     *_global_sum_squared;
-    BorderSize      _border_size;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLMEANSTDDEVKERNEL_H */

diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
deleted file mode 100644
index 23a21d6..0000000
--- a/src/core/CL/kernels/CLMedian3x3Kernel.cpp
+++ /dev/null

@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMedian3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-BorderSize CLMedian3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLMedian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLMedian3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    const std::string kernel_name = std::string("non_linear_filter_box3x3");
-    _kernel                       = create_kernel(compile_context, kernel_name, { "-DMEDIAN" });
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}

diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.h b/src/core/CL/kernels/CLMedian3x3Kernel.h
deleted file mode 100644
index 8cc5ed7..0000000
--- a/src/core/CL/kernels/CLMedian3x3Kernel.h
+++ /dev/null

@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEDIAN3X3KERNEL_H
-#define ARM_COMPUTE_CLMEDIAN3X3KERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the median 3x3 filter kernel.
- *
- */
-class CLMedian3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMEDIAN3X3KERNEL_H */

diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
deleted file mode 100644
index 675cfc1..0000000
--- a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
+++ /dev/null

@@ -1,246 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMinMaxLocationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <climits>
-
-namespace arm_compute
-{
-inline int32_t FloatFlip(float val)
-{
-    static_assert(sizeof(float) == sizeof(int32_t), "Float must be same size as int32_t");
-    int32_t int_val = 0;
-
-    memcpy(&int_val, &val, sizeof(float));
-    int_val = (int_val >= 0) ? int_val : int_val ^ 0x7FFFFFFF;
-    return int_val;
-}
-
-inline float IFloatFlip(int32_t val)
-{
-    static_assert(sizeof(float) == sizeof(int32_t), "Float must be same size as int32_t");
-    float flt_val = 0.f;
-
-    val = (val >= 0) ? val : val ^ 0x7FFFFFFF;
-    memcpy(&flt_val, &val, sizeof(float));
-    return flt_val;
-}
-
-CLMinMaxKernel::CLMinMaxKernel()
-    : _input(nullptr), _min_max(), _data_type_max_min()
-{
-}
-
-void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, min_max);
-}
-
-void CLMinMaxKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(min_max == nullptr);
-
-    _input                                               = input;
-    _min_max                                             = min_max;
-    const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
-
-    switch(input->info()->data_type())
-    {
-        case DataType::U8:
-            _data_type_max_min[0] = UCHAR_MAX;
-            _data_type_max_min[1] = 0;
-            break;
-        case DataType::S16:
-            _data_type_max_min[0] = SHRT_MAX;
-            _data_type_max_min[1] = SHRT_MIN;
-            break;
-        case DataType::F32:
-            _data_type_max_min[0] = FloatFlip(std::numeric_limits<float>::max());
-            _data_type_max_min[1] = FloatFlip(std::numeric_limits<float>::lowest());
-            break;
-        default:
-            ARM_COMPUTE_ERROR("You called with the wrong image data types");
-    }
-
-    // Set kernel build options
-    std::set<std::string> build_opts{ "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) };
-
-    if(num_elems_processed_per_iteration % max_cl_vector_width != 0)
-    {
-        build_opts.emplace("-DNON_MULTIPLE_OF_16");
-    }
-
-    if(input->info()->data_type() == DataType::F32)
-    {
-        build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(std::numeric_limits<float>::max()));
-        build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(std::numeric_limits<float>::lowest()));
-        build_opts.emplace("-DIS_DATA_TYPE_FLOAT");
-    }
-    else
-    {
-        build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(_data_type_max_min[0]));
-        build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(_data_type_max_min[1]));
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "minmax", build_opts);
-
-    // Set fixed arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, *_min_max);
-    _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(input->info()->dimension(0)));
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, ceil_to_multiple(num_elems_processed_per_iteration, 16)));
-    ICLKernel::configure_internal(win);
-}
-
-void CLMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    // Reset mininum and maximum values
-    queue.enqueueWriteBuffer(*_min_max, CL_FALSE /* blocking */, 0, _data_type_max_min.size() * sizeof(int), _data_type_max_min.data());
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-
-    cl_int min = 0;
-    cl_int max = 0;
-    queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 0 * sizeof(cl_int), sizeof(cl_int), static_cast<int *>(&min));
-    queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 1 * sizeof(cl_int), sizeof(cl_int), static_cast<int *>(&max));
-
-    if(_input->info()->data_type() == DataType::F32)
-    {
-        std::array<float, 2> min_max =
-        {
-            {
-                IFloatFlip(min),
-                IFloatFlip(max)
-            }
-        };
-        queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(float), min_max.data());
-    }
-    else
-    {
-        std::array<int32_t, 2> min_max = { { min, max } };
-        queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(int32_t), min_max.data());
-    }
-}
-
-CLMinMaxLocationKernel::CLMinMaxLocationKernel()
-    : _input(nullptr), _min_max_count(nullptr)
-{
-}
-
-void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, min_max, min_max_count, min_loc, max_loc);
-}
-
-void CLMinMaxLocationKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc,
-                                       ICLCoordinates2DArray *max_loc)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(min_max == nullptr);
-    ARM_COMPUTE_ERROR_ON(min_max_count == nullptr && min_loc == nullptr && max_loc == nullptr);
-
-    _input         = input;
-    _min_max_count = min_max_count;
-
-    // Set kernel build options
-    std::set<std::string> build_opts;
-    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : "");
-    build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : "");
-    build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : "");
-    if(input->info()->data_type() == DataType::F32)
-    {
-        build_opts.emplace("-DIS_DATA_TYPE_FLOAT");
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "minmaxloc", build_opts);
-
-    // Set static arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, *min_max);
-    _kernel.setArg(idx++, *min_max_count);
-    if(min_loc != nullptr)
-    {
-        _kernel.setArg(idx++, min_loc->cl_buffer());
-        _kernel.setArg<cl_uint>(idx++, min_loc->max_num_values());
-    }
-    if(max_loc != nullptr)
-    {
-        _kernel.setArg(idx++, max_loc->cl_buffer());
-        _kernel.setArg<cl_uint>(idx++, max_loc->max_num_values());
-    }
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    Window                 win                               = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-}
-
-void CLMinMaxLocationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    static const unsigned int zero_count = 0;
-    queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 0 * sizeof(zero_count), sizeof(zero_count), &zero_count);
-    queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 1 * sizeof(zero_count), sizeof(zero_count), &zero_count);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-} // namespace arm_compute

diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.h b/src/core/CL/kernels/CLMinMaxLocationKernel.h
deleted file mode 100644
index 2196abe..0000000
--- a/src/core/CL/kernels/CLMinMaxLocationKernel.h
+++ /dev/null

@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H
-#define ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "src/core/CL/ICLKernel.h"
-
-#include <array>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the kernel to perform min max search on an image.
- */
-class CLMinMaxKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLMinMaxKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxKernel(const CLMinMaxKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxKernel &operator=(const CLMinMaxKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMinMaxKernel(CLMinMaxKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMinMaxKernel &operator=(CLMinMaxKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input   Input Image. Data types supported: U8/S16/F32.
-     * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     */
-    void configure(const ICLImage *input, cl::Buffer *min_max);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input Image. Data types supported: U8/S16/F32.
-     * @param[out] min_max         Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;               /**< Input image. */
-    cl::Buffer      *_min_max;             /**< Minimum/maximum value. */
-    std::array<int, 2> _data_type_max_min; /**< Maximum and minimum data type value respectively. */
-};
-
-/** Interface for the kernel to find min max locations of an image.
- */
-class CLMinMaxLocationKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLMinMaxLocationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLocationKernel(const CLMinMaxLocationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLocationKernel &operator=(const CLMinMaxLocationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLocationKernel(CLMinMaxLocationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLocationKernel &operator=(CLMinMaxLocationKernel &&) = default;
-    /** Initialise the kernel's input and outputs.
-     *
-     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
-     *
-     * @param[in]  input         Input image. Data types supported: U8/S16/F32.
-     * @param[out] min_max       Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] min_max_count Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
-     * @param[out] min_loc       (Optional) Array of Coordinates2D used to store minimum value locations.
-     * @param[out] max_loc       (Optional) Array of Coordinates2D used to store maximum value locations.
-     */
-    void configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
-                   ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
-    /** Initialise the kernel's input and outputs.
-     *
-     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input image. Data types supported: U8/S16/F32.
-     * @param[out] min_max         Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] min_max_count   Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
-     * @param[out] min_loc         (Optional) Array of Coordinates2D used to store minimum value locations.
-     * @param[out] max_loc         (Optional) Array of Coordinates2D used to store maximum value locations.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
-                   ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage *_input;         /**< Input image. */
-    cl::Buffer     *_min_max_count; /**< Minimum/maximum value occurrences. */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H */

diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
deleted file mode 100644
index c73acaf..0000000
--- a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
+++ /dev/null

@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLNonLinearFilterKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-CLNonLinearFilterKernel::CLNonLinearFilterKernel()
-    : _border_size(0)
-{
-}
-
-BorderSize CLNonLinearFilterKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLNonLinearFilterKernel::configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
-                                        unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                                        bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_undefined);
-}
-
-void CLNonLinearFilterKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
-                                        unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                                        bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(mask_size != 3 && mask_size != 5);
-    ARM_COMPUTE_ERROR_ON_MSG(pattern == MatrixPattern::OTHER, "MatrixPattern::OTHER is not supported!");
-    ARM_COMPUTE_UNUSED(mask);
-
-    _input       = input;
-    _output      = output;
-    _border_size = BorderSize(mask_size / 2);
-
-    // Define build options
-    std::set<std::string> build_opts;
-    build_opts.emplace("-D" + string_from_non_linear_filter_function(function));
-
-    // Define kernel
-    std::string pattern_name = string_from_matrix_pattern(pattern);
-    std::transform(pattern_name.begin(), pattern_name.end(), pattern_name.begin(), ::tolower);
-    std::stringstream ss;
-    ss << "non_linear_filter_" << pattern_name << mask_size << "x" << mask_size;
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, ss.str(), build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    const unsigned int     num_rows_read_per_iteration       = mask_size;
-
-    Window win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}

diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.h b/src/core/CL/kernels/CLNonLinearFilterKernel.h
deleted file mode 100644
index ed42063..0000000
--- a/src/core/CL/kernels/CLNonLinearFilterKernel.h
+++ /dev/null

@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H
-#define ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to apply a non-linear filter */
-class CLNonLinearFilterKernel : public ICLSimple2DKernel
-{
-public:
-    /** Default constructor */
-    CLNonLinearFilterKernel();
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8
-     * @param[out] output           Destination tensor. Data types supported: U8
-     * @param[in]  function         Non linear function to perform
-     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
-     * @param[in]  pattern          Mask pattern
-     * @param[in]  mask             The given mask. Will be used only if pattern is specified to PATTERN_OTHER
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
-                   unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                   bool border_undefined);
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8
-     * @param[out] output           Destination tensor. Data types supported: U8
-     * @param[in]  function         Non linear function to perform
-     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
-     * @param[in]  pattern          Mask pattern
-     * @param[in]  mask             The given mask. Will be used only if pattern is specified to PATTERN_OTHER
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
-                   unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                   bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-
-private:
-    BorderSize _border_size; /**< Border size */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H */

diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
deleted file mode 100644
index 7d5c5ba..0000000
--- a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
+++ /dev/null

@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-BorderSize CLNonMaximaSuppression3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLNonMaximaSuppression3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLNonMaximaSuppression3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
-    _kernel                          = create_kernel(compile_context, "non_max_suppression", build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}

diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
deleted file mode 100644
index d9ed60c..0000000
--- a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
+++ /dev/null

@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H
-#define ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to perform Non-Maxima suppression over a 3x3 window using OpenCL
- *
- * @note Used by @ref CLFastCorners and @ref CLHarrisCorners
- */
-class CLNonMaximaSuppression3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor)
-     * @param[out] output           Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor)
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor)
-     * @param[out] output           Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor)
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H */

diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
deleted file mode 100644
index 7ceddc9..0000000
--- a/src/core/CL/kernels/CLScharr3x3Kernel.cpp
+++ /dev/null

@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLScharr3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLScharr3x3Kernel::CLScharr3x3Kernel()
-    : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
-{
-}
-
-BorderSize CLScharr3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLScharr3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLScharr3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_scharr_x = output_x != nullptr;
-    _run_scharr_y = output_y != nullptr;
-
-    if(_run_scharr_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_scharr_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input    = input;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_scharr_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_scharr_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "scharr3x3", build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLScharr3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument_if((_run_scharr_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_scharr_y), idx, _output_y, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.h b/src/core/CL/kernels/CLScharr3x3Kernel.h
deleted file mode 100644
index a670da5..0000000
--- a/src/core/CL/kernels/CLScharr3x3Kernel.h
+++ /dev/null

@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSCHARR3X3KERNEL_H
-#define ARM_COMPUTE_CLSCHARR3X3KERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 Scharr filter on a tensor.
- *
- * @f[
- *      \mathbf{G}_x=\begin{vmatrix}
- *      -3 & 0 & +3\\
- *      -10& 0 & +10\\
- *      -3 & 0 & +3
- *      \end{vmatrix}
- * @f]
- * @f[
- *      \mathbf{G}_y=\begin{vmatrix}
- *      -3 & -10 & -3\\
- *       0 & 0 & 0\\
- *      +3 & +10 & +3
- *      \end{vmatrix}
- * @f]
- */
-class CLScharr3x3Kernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLScharr3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLScharr3x3Kernel(const CLScharr3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLScharr3x3Kernel &operator=(const CLScharr3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLScharr3x3Kernel(CLScharr3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLScharr3x3Kernel &operator=(CLScharr3x3Kernel &&) = default;
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    bool             _run_scharr_x; /**< Do we need to run Scharr X ? */
-    bool             _run_scharr_y; /**< Do we need to run Scharr Y ? */
-    const ICLTensor *_input;        /**< Input image */
-    ICLTensor       *_output_x;     /**< Output image for scharr X */
-    ICLTensor       *_output_y;     /**< Output image for scharr Y */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSCHARR3X3KERNEL_H */

diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
deleted file mode 100644
index a87677a..0000000
--- a/src/core/CL/kernels/CLSobel3x3Kernel.cpp
+++ /dev/null

@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLSobel3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLSobel3x3Kernel::CLSobel3x3Kernel()
-    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
-{
-}
-
-BorderSize CLSobel3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLSobel3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLSobel3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input    = input;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_sobel_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_sobel_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("sobel3x3");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.h b/src/core/CL/kernels/CLSobel3x3Kernel.h
deleted file mode 100644
index fed8068..0000000
--- a/src/core/CL/kernels/CLSobel3x3Kernel.h
+++ /dev/null

@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL3X3KERNEL_H
-#define ARM_COMPUTE_CLSOBEL3X3KERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 Sobel filter on a tensor. */
-class CLSobel3x3Kernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel3x3Kernel(const CLSobel3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel3x3Kernel &operator=(const CLSobel3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel3x3Kernel(CLSobel3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel3x3Kernel &operator=(CLSobel3x3Kernel &&) = default;
-    /** Default destructor */
-    ~CLSobel3x3Kernel() = default;
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;       /**< Input tensor */
-    ICLTensor       *_output_x;    /**< Output tensor for Sobel X */
-    ICLTensor       *_output_y;    /**< Output tensor for Sobel Y */
-    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
-    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL3X3KERNEL_H */

diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
deleted file mode 100644
index c450bec..0000000
--- a/src/core/CL/kernels/CLSobel5x5Kernel.cpp
+++ /dev/null

@@ -1,251 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLSobel5x5HorKernel::CLSobel5x5HorKernel()
-    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
-{
-}
-
-BorderSize CLSobel5x5HorKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLSobel5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLSobel5x5HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input       = input;
-    _output_x    = output_x;
-    _output_y    = output_y;
-    _border_size = BorderSize(border_undefined ? 0 : 2, 2);
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_sobel_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_sobel_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("sobel_separable1x5");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel5x5HorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLSobel5x5VertKernel::CLSobel5x5VertKernel()
-    : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
-{
-}
-
-BorderSize CLSobel5x5VertKernel::border_size() const
-{
-    return BorderSize{ 2, 0 };
-}
-
-void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input_x, input_y, output_x, output_y, border_undefined);
-}
-
-void CLSobel5x5VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S16);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S16);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input_x  = input_x;
-    _input_y  = input_y;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_sobel_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_sobel_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("sobel_separable5x1");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 5;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowRectangle  input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel5x5VertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _input_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _input_y, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
-        _kernel.setArg(idx++, 0 /*dummy*/);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.h b/src/core/CL/kernels/CLSobel5x5Kernel.h
deleted file mode 100644
index a163ac9..0000000
--- a/src/core/CL/kernels/CLSobel5x5Kernel.h
+++ /dev/null

@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL5X5KERNEL_H
-#define ARM_COMPUTE_CLSOBEL5X5KERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 5x5 Sobel filter on a tensor. */
-class CLSobel5x5HorKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel5x5HorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5HorKernel(const CLSobel5x5HorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5HorKernel &operator=(const CLSobel5x5HorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5HorKernel(CLSobel5x5HorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5HorKernel &operator=(CLSobel5x5HorKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel5x5HorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;       /**< Input tensor */
-    ICLTensor       *_output_x;    /**< X output of horizontal pass */
-    ICLTensor       *_output_y;    /**< Y output of horizontal pass */
-    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
-    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
-    BorderSize       _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 5x5 Sobel filter on a tensor. */
-class CLSobel5x5VertKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel5x5VertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5VertKernel(const CLSobel5x5VertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5VertKernel &operator=(const CLSobel5x5VertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5VertKernel(CLSobel5x5VertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5VertKernel &operator=(CLSobel5x5VertKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel5x5VertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S16.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S16.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input_x;     /**< X input (X output of the horizontal pass) */
-    const ICLTensor *_input_y;     /**< Y input (Y output of the horizontal pass) */
-    ICLTensor       *_output_x;    /**< X output of sobel */
-    ICLTensor       *_output_y;    /**< Y output of sobel */
-    bool             _run_sobel_x; /**< Do we need to run sobel X? */
-    bool             _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL5X5KERNEL_H */

diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
deleted file mode 100644
index 1cfa74f..0000000
--- a/src/core/CL/kernels/CLSobel7x7Kernel.cpp
+++ /dev/null

@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLSobel7x7HorKernel::CLSobel7x7HorKernel()
-    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
-{
-}
-
-BorderSize CLSobel7x7HorKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLSobel7x7HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLSobel7x7HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
-    }
-
-    _input       = input;
-    _output_x    = output_x;
-    _output_y    = output_y;
-    _border_size = BorderSize(border_undefined ? 0 : 3, 3);
-
-    // Construct kernel name
-    const std::string kernel_name = "sobel_separable1x7";
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_sobel_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_sobel_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel7x7HorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLSobel7x7VertKernel::CLSobel7x7VertKernel()
-    : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
-{
-}
-
-BorderSize CLSobel7x7VertKernel::border_size() const
-{
-    return BorderSize{ 3, 0 };
-}
-
-void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input_x, input_y, output_x, output_y, border_undefined);
-}
-
-void CLSobel7x7VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S32);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S32);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
-    }
-
-    _input_x  = input_x;
-    _input_y  = input_y;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_sobel_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_sobel_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("sobel_separable7x1");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 7;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowRectangle  input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel7x7VertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _input_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _input_y, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
-        _kernel.setArg(idx++, 0 /*dummy*/);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}

diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.h b/src/core/CL/kernels/CLSobel7x7Kernel.h
deleted file mode 100644
index c85f0ae..0000000
--- a/src/core/CL/kernels/CLSobel7x7Kernel.h
+++ /dev/null

@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL7X7KERNEL_H
-#define ARM_COMPUTE_CLSOBEL7X7KERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 7x7 Sobel filter on a tensor. */
-class CLSobel7x7HorKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel7x7HorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7HorKernel(const CLSobel7x7HorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7HorKernel &operator=(const CLSobel7x7HorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7HorKernel(CLSobel7x7HorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7HorKernel &operator=(CLSobel7x7HorKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel7x7HorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;       /**< Input tensor */
-    ICLTensor       *_output_x;    /**< X output of horizontal pass */
-    ICLTensor       *_output_y;    /**< Y output of horizontal pass */
-    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
-    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
-    BorderSize       _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 7x7 Sobel filter on a tensor. */
-class CLSobel7x7VertKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel7x7VertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7VertKernel(const CLSobel7x7VertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7VertKernel &operator=(const CLSobel7x7VertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7VertKernel(CLSobel7x7VertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7VertKernel &operator=(CLSobel7x7VertKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel7x7VertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S32.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S32.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input_x;     /**< X input (X output of the horizontal pass) */
-    const ICLTensor *_input_y;     /**< Y input (Y output of the horizontal pass) */
-    ICLTensor       *_output_x;    /**< X output of sobel */
-    ICLTensor       *_output_y;    /**< Y output of sobel */
-    bool             _run_sobel_x; /**< Do we need to run sobel X? */
-    bool             _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL7X7KERNEL_H */

diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp
deleted file mode 100644
index b82f4c9..0000000
--- a/src/core/CL/kernels/CLTableLookupKernel.cpp
+++ /dev/null

@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLTableLookupKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLLut.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-#include <cstdint>
-#include <string>
-
-using namespace arm_compute;
-
-void CLTableLookupKernel::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, lut, output);
-}
-
-void CLTableLookupKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(lut == nullptr);
-    ARM_COMPUTE_ERROR_ON(DataType::U8 != lut->type() && DataType::S16 != lut->type());
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    // Create kernel
-    std::string kernel_name = (DataType::S16 == lut->type()) ? "tablelookup_S16" : "tablelookup_U8";
-    _kernel                 = create_kernel(compile_context, kernel_name);
-
-    // Set lut argument
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, lut->cl_buffer());
-    if(DataType::S16 == lut->type())
-    {
-        _kernel.setArg(idx++, lut->index_offset());
-        _kernel.setArg(idx++, static_cast<uint32_t>(lut->num_elements()));
-    }
-
-    // Configure kernel
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
-}

diff --git a/src/core/CL/kernels/CLTableLookupKernel.h b/src/core/CL/kernels/CLTableLookupKernel.h
deleted file mode 100644
index c8d15cb..0000000
--- a/src/core/CL/kernels/CLTableLookupKernel.h
+++ /dev/null

@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTABLELOOKUPKERNEL_H
-#define ARM_COMPUTE_CLTABLELOOKUPKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-class ICLLut;
-
-/** Interface for the kernel to perform table lookup calculations. */
-class CLTableLookupKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input, lut and output.
-     *
-     * @param[in]  input  An input tensor. Data types supported: U8, S16.
-     * @param[in]  lut    The input LUT. Data types supported: U8, S16.
-     * @param[out] output The output tensor. Data types supported: U8, S16.
-     */
-    void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
-    /** Initialise the kernel's input, lut and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           An input tensor. Data types supported: U8, S16.
-     * @param[in]  lut             The input LUT. Data types supported: U8, S16.
-     * @param[out] output          The output tensor. Data types supported: U8, S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLTABLELOOKUPKERNEL_H */

diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp
deleted file mode 100644
index 72c22f0..0000000
--- a/src/core/CL/kernels/CLThresholdKernel.cpp
+++ /dev/null

@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLThresholdKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <string>
-
-namespace arm_compute
-{
-void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
-}
-
-void CLThresholdKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    // Construct kernel name
-    std::string kernel_name = "threshold";
-
-    switch(info.type)
-    {
-        case ThresholdType::BINARY:
-            kernel_name += "_binary";
-            break;
-        case ThresholdType::RANGE:
-            kernel_name += "_range";
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Thresholding type not recognized");
-            break;
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name);
-
-    // Set arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, info.false_value);
-    _kernel.setArg(idx++, info.true_value);
-    _kernel.setArg(idx++, info.threshold);
-
-    if(ThresholdType::RANGE == info.type)
-    {
-        _kernel.setArg(idx++, info.upper);
-    }
-
-    // Make sure _kernel is initialized before calling the parent's configure
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
-}
-} // namespace arm_compute

diff --git a/src/core/CL/kernels/CLThresholdKernel.h b/src/core/CL/kernels/CLThresholdKernel.h
deleted file mode 100644
index 511eaed..0000000
--- a/src/core/CL/kernels/CLThresholdKernel.h
+++ /dev/null

@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTHRESHOLDKERNEL_H
-#define ARM_COMPUTE_CLTHRESHOLDKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the thresholding kernel. */
-class CLThresholdKernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input, output and threshold parameters.
-     *
-     * @param[in]  input  An input tensor. Data types supported: U8
-     * @param[out] output The output tensor. Data types supported: U8.
-     * @param[in]  info   Threshold descriptor
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
-    /**Initialise the kernel's input, output and threshold parameters.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           An input tensor. Data types supported: U8
-     * @param[out] output          The output tensor. Data types supported: U8.
-     * @param[in]  info            Threshold descriptor
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NETHRESHOLDKERNEL_H */

diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
deleted file mode 100644
index 600c67a..0000000
--- a/src/core/CL/kernels/CLWarpAffineKernel.cpp
+++ /dev/null

@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWarpAffineKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <set>
-#include <sstream>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-void options_add_matrix(std::set<std::string> &options, const std::array<float, 9> &matrix)
-{
-    for(size_t i = 0; i < 6; ++i)
-    {
-        std::stringstream mat_str;
-        mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
-        options.insert(mat_str.str());
-    }
-}
-} // namespace
-
-BorderSize CLWarpAffineKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLWarpAffineKernel::configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy);
-}
-
-void CLWarpAffineKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
-
-    _input  = input;
-    _output = output;
-
-    // Create build options
-    std::set<std::string> options;
-    options_add_matrix(options, matrix);
-    options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-
-    // Create kernel
-    std::string interpolation_name = string_from_interpolation_policy(policy);
-    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
-    const std::string kernel_name = "warp_affine_" + interpolation_name;
-    _kernel                       = create_kernel(compile_context, kernel_name, options);
-
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
-    _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    int       total_right  = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
-    const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
-
-    AccessWindowStatic     input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(3));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(3));
-    _config_id += "_";
-    _config_id += lower_string(string_from_interpolation_policy(policy));
-}
-} // namespace arm_compute

diff --git a/src/core/CL/kernels/CLWarpAffineKernel.h b/src/core/CL/kernels/CLWarpAffineKernel.h
deleted file mode 100644
index c600ee7..0000000
--- a/src/core/CL/kernels/CLWarpAffineKernel.h
+++ /dev/null

@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPAFFINEKERNEL_H
-#define ARM_COMPUTE_CLWARPAFFINEKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the warp affine kernel.*/
-class CLWarpAffineKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor, Data types supported: U8.
-     * @param[in]  matrix The perspective matrix. Must be 2x3 of type float
-     *                    The matrix argument requires 9 values, the last 3 values are ignored.
-     * @param[in]  policy The interpolation type.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor, Data types supported: U8.
-     * @param[in]  matrix          The perspective matrix. Must be 2x3 of type float
-     *                             The matrix argument requires 9 values, the last 3 values are ignored.
-     * @param[in]  policy          The interpolation type.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWARPAFFINEKERNEL_H */

diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
deleted file mode 100644
index 5f20a0b..0000000
--- a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
+++ /dev/null

@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cstddef>
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-namespace
-{
-inline void options_add_matrix(std::set<std::string> &options, const std::array<float, 9> &matrix)
-{
-    for(size_t i = 0; i < 9; ++i)
-    {
-        std::stringstream mat_str;
-        mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
-        options.insert(mat_str.str());
-    }
-}
-} // namespace
-
-BorderSize CLWarpPerspectiveKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLWarpPerspectiveKernel::configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy);
-}
-
-void CLWarpPerspectiveKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
-
-    _input  = input;
-    _output = output;
-
-    // Create build options
-    std::set<std::string> options;
-    options_add_matrix(options, matrix);
-    options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-
-    // Create kernel
-    std::string interpolation_name = string_from_interpolation_policy(policy);
-    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
-    std::string kernel_name = "warp_perspective_" + interpolation_name;
-    _kernel                 = create_kernel(compile_context, kernel_name, options);
-
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
-    _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowStatic     input_access(input->info(), -border_size().left, -border_size().top, input->info()->dimension(0) + border_size().right, input->info()->dimension(1) + border_size().bottom);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-}

diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.h b/src/core/CL/kernels/CLWarpPerspectiveKernel.h
deleted file mode 100644
index dcbe1c5..0000000
--- a/src/core/CL/kernels/CLWarpPerspectiveKernel.h
+++ /dev/null

@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H
-#define ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-/** Interface for the warp perspective kernel.*/
-class CLWarpPerspectiveKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor, Data types supported: U8.
-     * @param[in]  matrix The perspective matrix. Must be 3x3 of type float.
-     * @param[in]  policy The interpolation type.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor, Data types supported: U8.
-     * @param[in]  matrix          The perspective matrix. Must be 3x3 of type float.
-     * @param[in]  policy          The interpolation type.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H */

diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index aea245c..b2c5592 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h

@@ -35,7 +35,6 @@
 #include "src/core/NEON/kernels/NECol2ImKernel.h"
 #include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
 #include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEConvolutionKernel.h"
 #include "src/core/NEON/kernels/NECropKernel.h"
 #include "src/core/NEON/kernels/NECumulativeDistributionKernel.h"
 #include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
@@ -72,7 +71,6 @@
 #include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
 #include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
 #include "src/core/NEON/kernels/NEMinMaxLayerKernel.h"
-#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
 #include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
 #include "src/core/NEON/kernels/NEPadLayerKernel.h"
 #include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
@@ -83,6 +81,7 @@
 #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 #include "src/core/NEON/kernels/NERangeKernel.h"
 #include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/NEON/kernels/NERemapKernel.h"
 #include "src/core/NEON/kernels/NEReorgLayerKernel.h"
 #include "src/core/NEON/kernels/NEReverseKernel.h"
 #include "src/core/NEON/kernels/NEScaleKernel.h"

diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
deleted file mode 100644
index 075de41..0000000
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ /dev/null

@@ -1,1625 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEConvolutionKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <array>
-#include <cstdint>
-#include <cstring>
-#include <tuple>
-
-namespace arm_compute
-{
-namespace
-{
-const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
-
-inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output)
-{
-    const int16x8_t s16results = vcombine_s16(vqmovn_s32(out),
-                                              vqmovn_s32(out2));
-    vst1q_s16(output, s16results);
-}
-
-inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output)
-{
-    const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovun_s32(out),
-                                                        vqmovun_s32(out2)));
-    vst1_u8(output, u8results);
-}
-
-inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output)
-{
-    const uint16x8_t u16results = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2));
-    const int16x8_t  s16results = vreinterpretq_s16_u16(vminq_u16(u16results, max_int16));
-    vst1q_s16(output, s16results);
-}
-
-inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output)
-{
-    const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovn_u32(out),
-                                                        vqmovn_u32(out2)));
-    vst1_u8(output, u8results);
-}
-
-inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output)
-{
-    vst1q_s16(output, out);
-    vst1q_s16(output + 8, out2);
-}
-
-inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output)
-{
-    const uint8x16_t u8results = vcombine_u8(vqmovun_s16(out),
-                                             vqmovun_s16(out2));
-    vst1q_u8(output, u8results);
-}
-
-inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output)
-{
-    const uint8x16_t u8results = vcombine_u8(vqmovn_u16(out),
-                                             vqmovn_u16(out2));
-    vst1q_u8(output, u8results);
-}
-
-inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output)
-{
-    vst1q_s16(output, vreinterpretq_s16_u16(vminq_u16(out, max_int16)));
-    vst1q_s16(output + 8, vreinterpretq_s16_u16(vminq_u16(out2, max_int16)));
-}
-
-inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2)
-{
-    // Convert to s16 and split in blocks of 4 values:
-    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
-    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
-
-    const int16x4x3_t row =
-    {
-        {
-            vget_low_s16(s16_tmp0),
-            vget_high_s16(s16_tmp0),
-            vget_low_s16(s16_tmp1)
-        }
-    };
-
-    // Calculate row left value for pixels [0,3]
-    out = vmlal_s16(out, row.val[0], mat0);
-    // Calculate row middle value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
-    // Calculate row right value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
-
-    // Calculate row left value for pixels [4,7]
-    out2 = vmlal_s16(out2, row.val[1], mat0);
-    // Calculate row middle value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
-    // Calculate row right value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
-}
-
-inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
-{
-    const int16x4_t mat0 = vld1_dup_s16(convolution);
-    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
-    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
-
-    convolve_row3x1_unrolled(out, out2, row_data, mat0, mat1, mat2);
-}
-
-inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
-{
-    const int16x4_t mat0 = vld1_dup_s16(convolution);
-    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
-    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
-    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
-    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
-
-    // Convert to s16 and split in blocks of 4 values:
-    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
-    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
-
-    const int16x4x3_t row =
-    {
-        {
-            vget_low_s16(s16_tmp0),
-            vget_high_s16(s16_tmp0),
-            vget_low_s16(s16_tmp1)
-        }
-    };
-
-    // Calculate row left 2 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[0], mat0);
-    // Calculate row left 1 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
-    // Calculate row middle value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
-    // Calculate row right +1 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
-    // Calculate row right +2 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[1], mat4);
-
-    // Calculate row left 2 value for pixels [4,7]
-    out2 = vmlal_s16(out2, row.val[1], mat0);
-    // Calculate row left 1 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
-    // Calculate row middle value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
-    // Calculate row right +1 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
-    // Calculate row right +2 value for pixels [4,7]
-    out2 = vmlal_s16(out2, row.val[2], mat4);
-}
-
-inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
-{
-    const int16x4_t mat0 = vld1_dup_s16(convolution);
-    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
-    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
-    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
-    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
-    const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
-    const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
-
-    // Convert to s16 and split in blocks of 4 values:
-    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
-    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
-
-    const int16x4x4_t row =
-    {
-        {
-            vget_low_s16(s16_tmp0),
-            vget_high_s16(s16_tmp0),
-            vget_low_s16(s16_tmp1),
-            vget_high_s16(s16_tmp1)
-        }
-    };
-
-    // Calculate row left 3 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[0], mat0);
-    // Calculate row left 2 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
-    // Calculate row left 1 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
-    // Calculate row middle value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
-    // Calculate row right +1 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[1], mat4);
-    // Calculate row right +2 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
-    // Calculate row right +3 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
-
-    // Calculate row left 3 value for pixels [4,7]
-    out2 = vmlal_s16(out2, row.val[1], mat0);
-    // Calculate row left 2 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
-    // Calculate row left 1 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
-    // Calculate row middle value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
-    // Calculate row right +1 value for pixels [4,7]
-    out2 = vmlal_s16(out2, row.val[2], mat4);
-    // Calculate row right +2 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
-    // Calculate row right +3 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
-}
-
-inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
-{
-    const int16x4_t mat0 = vld1_dup_s16(convolution);
-    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
-    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
-    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
-    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
-    const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
-    const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
-    const int16x4_t mat7 = vld1_dup_s16(convolution + 7);
-    const int16x4_t mat8 = vld1_dup_s16(convolution + 8);
-
-    // Convert to s16 and split in blocks of 4 values:
-    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
-    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
-
-    const int16x4x4_t row =
-    {
-        {
-            vget_low_s16(s16_tmp0),
-            vget_high_s16(s16_tmp0),
-            vget_low_s16(s16_tmp1),
-            vget_high_s16(s16_tmp1)
-        }
-    };
-
-    // Calculate row left 4 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[0], mat0);
-    // Calculate row left 3 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
-    // Calculate row left 2 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
-    // Calculate row left 1 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
-    // Calculate row middle value for pixels [0,3]
-    out = vmlal_s16(out, row.val[1], mat4);
-    // Calculate row right +1 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
-    // Calculate row right +2 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
-    // Calculate row right +3 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7);
-    // Calculate row right +4 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[2], mat8);
-
-    // Calculate row left 4 value for pixels [0,3]
-    out2 = vmlal_s16(out2, row.val[1], mat0);
-    // Calculate row left 3 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
-    // Calculate row left 2 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
-    // Calculate row left 1 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
-    // Calculate row middle value for pixels [0,3]
-    out2 = vmlal_s16(out2, row.val[2], mat4);
-    // Calculate row right +1 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
-    // Calculate row right +2 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
-    // Calculate row right +3 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7);
-    // Calculate row right +4 value for pixels [0,3]
-    out2 = vmlal_s16(out2, row.val[3], mat8);
-}
-} // namespace
-
-/****************************************************************************************\
- *                                    Square Convolution                                *
-\****************************************************************************************/
-
-template <unsigned int matrix_size>
-NEConvolutionKernel<matrix_size>::NEConvolutionKernel()
-    : INESimpleKernel(), _scale(0), _convolution{ {} }
-{
-}
-
-template <unsigned int matrix_size>
-BorderSize             NEConvolutionKernel<matrix_size>::border_size() const
-{
-    return BorderSize{ matrix_size / 2 };
-}
-
-template <unsigned int matrix_size>
-void NEConvolutionKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
-
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-
-    _input  = input;
-    _output = output;
-
-    std::copy_n(conv, _convolution.size(), _convolution.begin());
-
-    if(scale == 0)
-    {
-        _scale = calculate_matrix_scale(_convolution.data(), matrix_size);
-    }
-    else
-    {
-        _scale = scale;
-    }
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-template <>
-template <typename OutputType>
-void NEConvolutionKernel<3>::convolution(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    // Load the matrix's coefficients into Neon registers:
-    const int16x4_t   mat00     = vld1_dup_s16(_convolution.data());
-    const int16x4_t   mat01     = vld1_dup_s16(_convolution.data() + 1);
-    const int16x4_t   mat02     = vld1_dup_s16(_convolution.data() + 2);
-    const int16x4_t   mat10     = vld1_dup_s16(_convolution.data() + 3);
-    const int16x4_t   mat11     = vld1_dup_s16(_convolution.data() + 4);
-    const int16x4_t   mat12     = vld1_dup_s16(_convolution.data() + 5);
-    const int16x4_t   mat20     = vld1_dup_s16(_convolution.data() + 6);
-    const int16x4_t   mat21     = vld1_dup_s16(_convolution.data() + 7);
-    const int16x4_t   mat22     = vld1_dup_s16(_convolution.data() + 8);
-    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
-
-    const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1));
-    const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
-    const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4_t out  = vdupq_n_s32(0);
-        int32x4_t out2 = vdupq_n_s32(0);
-
-        // Load 16 bytes from the top row:
-        const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-        convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02);
-
-        // Load 16 bytes from the middle row:
-        const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-        convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12);
-
-        // Load 16 bytes from the middle row:
-        const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset());
-        convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22);
-
-        // Apply scale
-        if(_scale != 1)
-        {
-            // Convert to F32, scale and convert back to S32
-            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
-            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
-        }
-
-        // Clamp and store as U8 or S16:
-        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
-    },
-    input, output);
-}
-
-template <>
-template <typename OutputType>
-void NEConvolutionKernel<5>::convolution(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
-
-    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2));
-    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1));
-    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0));
-    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
-    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4_t out  = vdupq_n_s32(0);
-        int32x4_t out2 = vdupq_n_s32(0);
-
-        // Load 16 bytes from the top2 row:
-        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
-        convolve_row5x1(out, out2, data_t2, _convolution.data());
-
-        // Load 16 bytes from the top1 row:
-        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
-        convolve_row5x1(out, out2, data_t1, _convolution.data() + 5);
-
-        // Load 16 bytes from the middle row:
-        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
-        convolve_row5x1(out, out2, data_m, _convolution.data() + 10);
-
-        // Load 16 bytes from the low1 row:
-        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
-        convolve_row5x1(out, out2, data_b1, _convolution.data() + 15);
-
-        // Load 16 bytes from the low2 row:
-        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
-        convolve_row5x1(out, out2, data_b2, _convolution.data() + 20);
-
-        // Apply scale
-        if(_scale != 1)
-        {
-            // Convert to F32, scale and convert back to S32
-            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
-            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
-        }
-
-        // Clamp and store as U8 or S16:
-        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
-    },
-    input, output);
-}
-
-template <>
-template <typename OutputType>
-void NEConvolutionKernel<7>::convolution(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
-
-    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3));
-    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2));
-    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1));
-    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0));
-    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1));
-    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
-    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4_t out  = vdupq_n_s32(0);
-        int32x4_t out2 = vdupq_n_s32(0);
-
-        // Load 16 bytes from the top3 row:
-        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
-        convolve_row7x1(out, out2, data_t3, _convolution.data());
-
-        // Load 16 bytes from the top2 row:
-        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
-        convolve_row7x1(out, out2, data_t2, _convolution.data() + 7);
-
-        // Load 16 bytes from the top1 row:
-        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
-        convolve_row7x1(out, out2, data_t1, _convolution.data() + 14);
-
-        // Load 16 bytes from the middle row:
-        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
-        convolve_row7x1(out, out2, data_m, _convolution.data() + 21);
-
-        // Load 16 bytes from the low1 row:
-        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
-        convolve_row7x1(out, out2, data_b1, _convolution.data() + 28);
-
-        // Load 16 bytes from the low2 row:
-        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
-        convolve_row7x1(out, out2, data_b2, _convolution.data() + 35);
-
-        // Load 16 bytes from the low3 row:
-        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
-        convolve_row7x1(out, out2, data_b3, _convolution.data() + 42);
-
-        // Apply scale
-        if(_scale != 1)
-        {
-            // Convert to F32, scale and convert back to S32
-            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
-            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
-        }
-
-        // Clamp and store as U8 or S16:
-        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
-    },
-    input, output);
-}
-
-template <>
-template <typename OutputType>
-void NEConvolutionKernel<9>::convolution(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
-
-    const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4));
-    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3));
-    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2));
-    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1));
-    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0));
-    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1));
-    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2));
-    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
-    const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4_t out  = vdupq_n_s32(0);
-        int32x4_t out2 = vdupq_n_s32(0);
-
-        // Load 16 bytes from the top4 row:
-        const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset());
-        convolve_row9x1(out, out2, data_t4, _convolution.data());
-
-        // Load 16 bytes from the top3 row:
-        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
-        convolve_row9x1(out, out2, data_t3, _convolution.data() + 9);
-
-        // Load 16 bytes from the top2 row:
-        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
-        convolve_row9x1(out, out2, data_t2, _convolution.data() + 18);
-
-        // Load 16 bytes from the top1 row:
-        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
-        convolve_row9x1(out, out2, data_t1, _convolution.data() + 27);
-
-        // Load 16 bytes from the middle row:
-        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
-        convolve_row9x1(out, out2, data_m, _convolution.data() + 36);
-
-        // Load 16 bytes from the low1 row:
-        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
-        convolve_row9x1(out, out2, data_b1, _convolution.data() + 45);
-
-        // Load 16 bytes from the low2 row:
-        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
-        convolve_row9x1(out, out2, data_b2, _convolution.data() + 54);
-
-        // Load 16 bytes from the low3 row:
-        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
-        convolve_row9x1(out, out2, data_b3, _convolution.data() + 63);
-
-        // Load 16 bytes from the low4 row:
-        const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset());
-        convolve_row9x1(out, out2, data_b4, _convolution.data() + 72);
-
-        // Apply scale
-        if(_scale != 1)
-        {
-            // Convert to F32, scale and convert back to S32
-            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
-            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
-        }
-
-        // Clamp and store as U8 or S16:
-        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
-    },
-    input, output);
-}
-
-template <unsigned int matrix_size>
-void NEConvolutionKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    switch(_output->info()->data_type())
-    {
-        case DataType::U8:
-            convolution<uint8_t>(window);
-            break;
-        case DataType::S16:
-            convolution<int16_t>(window);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported Data type!");
-            break;
-    }
-}
-
-template class arm_compute::NEConvolutionKernel<3>;
-template class arm_compute::NEConvolutionKernel<5>;
-template class arm_compute::NEConvolutionKernel<7>;
-template class arm_compute::NEConvolutionKernel<9>;
-
-/****************************************************************************************\
- *                              Separable Square Convolution                            *
-\****************************************************************************************/
-
-template <unsigned int matrix_size>
-NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
-    : _conv_row{ { 0 } }, _border_size(0)
-{
-}
-
-template <unsigned int matrix_size>
-BorderSize             NESeparableConvolutionHorKernel<matrix_size>::border_size() const
-{
-    return _border_size;
-}
-
-template <unsigned int matrix_size>
-void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);
-
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
-
-    _input  = input;
-    _output = output;
-    std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
-    _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-template <unsigned int matrix_size>
-void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    switch(_output->info()->data_type())
-    {
-        case DataType::U16:
-            convolve<uint16_t>(window);
-            break;
-        case DataType::S16:
-            convolve<int16_t>(window);
-            break;
-        case DataType::S32:
-            convolve<int32_t>(window);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
-            break;
-    }
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -2);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const uint16x8x2_t data_u16 =
-        {
-            {
-                vmovl_u8(vget_low_u8(data)),
-                vmovl_u8(vget_high_u8(data))
-            }
-        };
-
-        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
-
-        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<5>::convolve<int16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -2);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-void NESeparableConvolutionHorKernel<5>::convolve<int32_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -2);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
-        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
-        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
-        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
-
-        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
-
-        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
-    },
-    input, output);
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<7>::convolve<uint16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -3);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const uint16x8x2_t data_u16 =
-        {
-            {
-                vmovl_u8(vget_low_u8(data)),
-                vmovl_u8(vget_high_u8(data))
-            }
-        };
-
-        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
-
-        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<7>::convolve<int16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -3);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-void NESeparableConvolutionHorKernel<7>::convolve<int32_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -3);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
-        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
-        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
-        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
-        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
-        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
-
-        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
-
-        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
-    },
-    input, output);
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<9>::convolve<uint16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -4);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const uint16x8x2_t data_u16 =
-        {
-            {
-                vmovl_u8(vget_low_u8(data)),
-                vmovl_u8(vget_high_u8(data))
-            }
-        };
-
-        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]);
-        out            = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]);
-
-        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<9>::convolve<int16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -4);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]);
-        out           = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]);
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-void NESeparableConvolutionHorKernel<9>::convolve<int32_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -4);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
-        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
-        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
-        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
-        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
-        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
-        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);
-
-        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
-
-        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
-    },
-    input, output);
-}
-
-template class arm_compute::NESeparableConvolutionHorKernel<5>;
-template class arm_compute::NESeparableConvolutionHorKernel<7>;
-template class arm_compute::NESeparableConvolutionHorKernel<9>;
-
-template <unsigned int matrix_size>
-NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
-    : _conv_col{ { 0 } }, _scale(0)
-{
-}
-
-template <unsigned int matrix_size>
-BorderSize             NESeparableConvolutionVertKernel<matrix_size>::border_size() const
-{
-    return BorderSize{ matrix_size / 2, 0 };
-}
-
-template <unsigned int matrix_size>
-void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);
-
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(scale == 0);
-
-    _input  = input;
-    _output = output;
-    std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
-    _scale = scale;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 16;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-template <unsigned int matrix_size>
-void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    switch(_input->info()->data_type())
-    {
-        case DataType::U16:
-            switch(_output->info()->data_type())
-            {
-                case DataType::U8:
-                    convolution_u16<uint8_t>(window);
-                    break;
-                case DataType::S16:
-                    convolution_u16<int16_t>(window);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-            break;
-        case DataType::S16:
-            switch(_output->info()->data_type())
-            {
-                case DataType::U8:
-                    convolution_s16<uint8_t>(window);
-                    break;
-                case DataType::S16:
-                    convolution_s16<int16_t>(window);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-            break;
-        case DataType::S32:
-            switch(_output->info()->data_type())
-            {
-                case DataType::U8:
-                    convolution_s32<uint8_t>(window);
-                    break;
-                case DataType::S16:
-                    convolution_s32<int16_t>(window);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
-            break;
-    }
-}
-
-template <unsigned int matrix_size>
-template <typename OutputType>
-void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-
-    Window win_in(win);
-    win_in.set_dimension_step(Window::DimX, 8);
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, win);
-
-    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
-    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
-    const int         k_half       = matrix_size / 2;
-
-    // Set row pointers
-    for(int i = -k_half; i <= k_half; ++i)
-    {
-        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
-    }
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        uint16x8_t out0 = vdupq_n_u16(0);
-        uint16x8_t out1 = vdupq_n_u16(0);
-
-        // First half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
-            out0                  = vmlaq_n_u16(out0, data, _conv_col[r]);
-        }
-
-        in.increment(Window::DimX);
-
-        // Second half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
-            out1                  = vmlaq_n_u16(out1, data, _conv_col[r]);
-        }
-
-        //scale the result if needed
-        if(_scale != 1)
-        {
-            float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
-            float32x4_t out0_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
-            out0_f32_high             = vmulq_f32(out0_f32_high, oneoverscale);
-            out0_f32_low              = vmulq_f32(out0_f32_low, oneoverscale);
-            store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
-
-            float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
-            float32x4_t out1_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
-            out1_f32_high             = vmulq_f32(out1_f32_high, oneoverscale);
-            out1_f32_low              = vmulq_f32(out1_f32_low, oneoverscale);
-            store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
-        }
-        else
-        {
-            store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
-        }
-    },
-    in, out);
-}
-
-template <unsigned int matrix_size>
-template <typename OutputType>
-void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-
-    Window win_in(win);
-    win_in.set_dimension_step(Window::DimX, 8);
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, win);
-
-    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
-    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
-    const int         k_half       = matrix_size / 2;
-
-    // Set row pointers
-    for(int i = -k_half; i <= k_half; ++i)
-    {
-        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
-    }
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int16x8_t out0 = vdupq_n_s16(0);
-        int16x8_t out1 = vdupq_n_s16(0);
-
-        // First half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
-            out0                 = vmlaq_n_s16(out0, data, _conv_col[r]);
-        }
-
-        in.increment(Window::DimX);
-
-        // Second half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
-            out1                 = vmlaq_n_s16(out1, data, _conv_col[r]);
-        }
-
-        //scale the result if needed
-        if(_scale != 1)
-        {
-            float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0)));
-            float32x4_t out0_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0)));
-            out0_f32_high             = vmulq_f32(out0_f32_high, oneoverscale);
-            out0_f32_low              = vmulq_f32(out0_f32_low, oneoverscale);
-            store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
-
-            float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1)));
-            float32x4_t out1_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1)));
-            out1_f32_high             = vmulq_f32(out1_f32_high, oneoverscale);
-            out1_f32_low              = vmulq_f32(out1_f32_low, oneoverscale);
-            store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
-        }
-        else
-        {
-            store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
-        }
-    },
-    in, out);
-}
-
-template <unsigned int matrix_size>
-template <typename OutputType>
-void NESeparableConvolutionVertKernel<matrix_size>::convolution_s32(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-
-    Window win_in(win);
-    win_in.set_dimension_step(Window::DimX, 8);
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, win);
-
-    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
-    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
-    const int         k_half       = matrix_size / 2;
-
-    // Set row pointers
-    for(int i = -k_half; i <= k_half; ++i)
-    {
-        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
-    }
-
-    const int32x4_t zero = vdupq_n_s32(0);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4x2_t out0 =
-        {
-            {
-                zero,
-                zero
-            }
-        };
-
-        int32x4x2_t out1 =
-        {
-            {
-                zero,
-                zero
-            }
-        };
-
-        // First half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
-            out0.val[0]            = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]);
-            out0.val[1]            = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]);
-        }
-
-        in.increment(Window::DimX);
-
-        // Second half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
-            out1.val[0]            = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]);
-            out1.val[1]            = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]);
-        }
-
-        //scale the result if needed
-        if(_scale != 1)
-        {
-            float32x4_t out0_f32_odd  = vcvtq_f32_s32(out0.val[0]);
-            float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]);
-            out0_f32_odd              = vmulq_f32(out0_f32_odd, oneoverscale);
-            out0_f32_even             = vmulq_f32(out0_f32_even, oneoverscale);
-            out0.val[0]               = vcvtq_s32_f32(out0_f32_odd);
-            out0.val[1]               = vcvtq_s32_f32(out0_f32_even);
-
-            float32x4_t out1_f32_odd  = vcvtq_f32_s32(out1.val[0]);
-            float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]);
-            out1_f32_odd              = vmulq_f32(out1_f32_odd, oneoverscale);
-            out1_f32_even             = vmulq_f32(out1_f32_even, oneoverscale);
-            out1.val[0]               = vcvtq_s32_f32(out1_f32_odd);
-            out1.val[1]               = vcvtq_s32_f32(out1_f32_even);
-        }
-
-        const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]);
-        store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()));
-
-        const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]);
-        store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()) + 8);
-    },
-    in, out);
-}
-
-template class arm_compute::NESeparableConvolutionVertKernel<5>;
-template class arm_compute::NESeparableConvolutionVertKernel<7>;
-template class arm_compute::NESeparableConvolutionVertKernel<9>;
-
-/****************************************************************************************\
- *                                 Rectangle Convolution                                *
-\****************************************************************************************/
-
-NEConvolutionRectangleKernel::NEConvolutionRectangleKernel()
-    : _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0)
-{
-}
-
-BorderSize NEConvolutionRectangleKernel::border_size() const
-{
-    return _border_size;
-}
-
-void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
-
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9);
-    ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9);
-    ARM_COMPUTE_ERROR_ON(0 == scale);
-
-    _input       = input;
-    _output      = output;
-    _scale       = scale;
-    _border_size = BorderSize(height / 2, width / 2);
-
-    // Setup the convolution matrix
-    const uint32_t nr_elements = width * height;
-    _convolution.resize(nr_elements);
-    std::copy_n(conv, nr_elements, _convolution.begin());
-
-    // Set function index to help choose appropriate function in run()
-    _func_idx = get_index(height) * 4 + get_index(width);
-    ARM_COMPUTE_ERROR_ON(_func_idx > (_nr_supported_sizes * _nr_supported_sizes));
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size);
-
-    INEKernel::configure(win);
-}
-
-void NEConvolutionRectangleKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window);
-
-    // uint8_t function table
-    static const std::array<ConvolutionRectangleFunction, 16> func_table_u8 =
-    {
-        {
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 3>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 5>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 7>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 9>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 3>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 5>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 7>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 9>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 3>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 5>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 7>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 9>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 3>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 5>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 7>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 9>
-        }
-    };
-    // int16_t function table
-    static const std::array<ConvolutionRectangleFunction, 16> func_table_s16 =
-    {
-        {
-            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 3>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 5>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 7>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 9>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 3>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 5>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 7>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 9>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 3>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 5>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 7>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 9>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 3>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 5>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 7>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 9>
-        }
-    };
-
-    // Run appropriate function
-    switch(_output->info()->data_type())
-    {
-        case DataType::U8:
-            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
-            (this->*func_table_u8[_func_idx])(window);
-            break;
-        case DataType::S16:
-            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
-            (this->*func_table_s16[_func_idx])(window);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-    }
-}
-
-unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val)
-{
-    switch(val)
-    {
-        case 3:
-            return 0;
-        case 5:
-            return 1;
-        case 7:
-            return 2;
-        case 9:
-            return 3;
-        default:
-            ARM_COMPUTE_ERROR("Not supported dimension size");
-            return 0;
-    }
-}
-
-template <typename OutputType, unsigned int rows, unsigned int cols>
-void NEConvolutionRectangleKernel::convolution(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    std::array<unsigned char *, rows> input_ptrs{ {} };
-    const int16_t    *conv       = _convolution.data();
-    const float32x4_t scale_val  = vdupq_n_f32(1.0f / _scale);
-    const int         k_row_half = rows / 2;
-    const int         k_col_half = cols / 2;
-
-    // Set row pointers
-    for(int i = -k_row_half; i <= k_row_half; ++i)
-    {
-        input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
-    }
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4_t out  = vdupq_n_s32(0);
-        int32x4_t out2 = vdupq_n_s32(0);
-
-        // Perform appropriate convolution
-        for(unsigned int r = 0; r < rows; ++r)
-        {
-            const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
-            if(3 == cols)
-            {
-                convolve_row3x1(out, out2, data, conv + r * cols);
-            }
-            else if(5 == cols)
-            {
-                convolve_row5x1(out, out2, data, conv + r * cols);
-            }
-            else if(7 == cols)
-            {
-                convolve_row7x1(out, out2, data, conv + r * cols);
-            }
-            else if(9 == cols)
-            {
-                convolve_row9x1(out, out2, data, conv + r * cols);
-            }
-            else
-            {
-                ARM_COMPUTE_ERROR("Unsupported number of columns");
-            }
-        }
-
-        // Apply scale
-        if(_scale != 1)
-        {
-            // Convert to F32, scale and convert back to S32
-            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
-            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
-        }
-
-        // Clamp and store as U8 or S16:
-        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
-    },
-    input, output);
-}
-} // namespace arm_compute

diff --git a/src/core/NEON/kernels/NEConvolutionKernel.h b/src/core/NEON/kernels/NEConvolutionKernel.h
deleted file mode 100644
index b8bf1d1..0000000
--- a/src/core/NEON/kernels/NEConvolutionKernel.h
+++ /dev/null

@@ -1,299 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL_H
-#define ARM_COMPUTE_NECONVOLUTIONKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/NEON/INESimpleKernel.h"
-
-#include <array>
-#include <cstdint>
-#include <vector>
-
-namespace arm_compute
-{
-class ITensor;
-
-/****************************************************************************************\
- *                                    Square Convolution                                *
-\****************************************************************************************/
-
-/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
- * The client can supply a convolution matrix \f$ C_{m,n} \f$.
- * @f{eqnarray}{
- *  k_0 &=& \frac{m}{2}  \\
- *  l_0 &=& \frac{n}{2}  \\
- *  sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
- *  @f}
- *
- * @note The above equation for this function is similar to the default OpenCV Filter2D function,
- *       which actually computes a correlation and not a convolution.
- *       In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
- */
-template <unsigned int matrix_size>
-class NEConvolutionKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEConvolutionKernel";
-    }
-    /** Default constructor */
-    NEConvolutionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NEConvolutionKernel(const NEConvolutionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NEConvolutionKernel &operator=(const NEConvolutionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEConvolutionKernel(NEConvolutionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEConvolutionKernel &operator=(NEConvolutionKernel &&) = default;
-    /** Default destructor */
-    ~NEConvolutionKernel() = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    template <typename OutputType>
-    void convolution(const Window &win);
-
-protected:
-    uint32_t _scale;                                             /**< scale of the convolution */
-    std::array<int16_t, matrix_size *matrix_size> _convolution;  /**< convolution matrix */
-};
-
-/** Interface for the kernel which applied a 3x3 convolution to a tensor.*/
-using NEConvolution3x3Kernel = NEConvolutionKernel<3>;
-/** Interface for the kernel which applied a 5x5 convolution to a tensor.*/
-using NEConvolution5x5Kernel = NEConvolutionKernel<5>;
-/** Interface for the kernel which applied a 7x7 convolution to a tensor.*/
-using NEConvolution7x7Kernel = NEConvolutionKernel<7>;
-///** Interface for the kernel which applied a 9x9 convolution to a tensor.*/
-using NEConvolution9x9Kernel = NEConvolutionKernel<9>;
-
-/****************************************************************************************\
- *                              Separable Square Convolution                            *
-\****************************************************************************************/
-
-/** Kernel for the Horizontal pass of a Separable Convolution */
-template <unsigned int matrix_size>
-class NESeparableConvolutionHorKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESeparableConvolutionHorKernel";
-    }
-    /** Default constructor */
-    NESeparableConvolutionHorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NESeparableConvolutionHorKernel(const NESeparableConvolutionHorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NESeparableConvolutionHorKernel &operator=(const NESeparableConvolutionHorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESeparableConvolutionHorKernel(NESeparableConvolutionHorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESeparableConvolutionHorKernel &operator=(NESeparableConvolutionHorKernel &&) = default;
-    /** Default destructor */
-    ~NESeparableConvolutionHorKernel() = default;
-
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U16, S16, S32.
-     * @param[in]  conv_row         Convolution matrix to apply to the input tensor.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Apply the object's convolution to the given window of the input tensor..
-     *
-     * @param[in] window Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolve(const Window &window);
-
-    std::array<int16_t, matrix_size> _conv_row; /**< Convolution coefficients */
-    BorderSize _border_size;                    /**< Border size */
-};
-
-/** Interface for the kernel which applied a 5x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution5x5HorKernel = NESeparableConvolutionHorKernel<5>;
-/** Interface for the kernel which applied a 7x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution7x7HorKernel = NESeparableConvolutionHorKernel<7>;
-/** Interface for the kernel which applied a 9x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution9x9HorKernel = NESeparableConvolutionHorKernel<9>;
-
-/** Kernel for the Vertical pass of a Separable Convolution */
-template <unsigned int matrix_size>
-class NESeparableConvolutionVertKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESeparableConvolutionVertKernel";
-    }
-    /** Default constructor */
-    NESeparableConvolutionVertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NESeparableConvolutionVertKernel(const NESeparableConvolutionVertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NESeparableConvolutionVertKernel &operator=(const NESeparableConvolutionVertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESeparableConvolutionVertKernel(NESeparableConvolutionVertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESeparableConvolutionVertKernel &operator=(NESeparableConvolutionVertKernel &&) = default;
-    /** Default destructor */
-    ~NESeparableConvolutionVertKernel() = default;
-
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U16, S16, S32.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv_col         Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Apply the object's convolution to the given window of the input tensor.
-     *  This function is used if the intermediate values have been stored as U16.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolution_u16(const Window &win);
-    /** Apply the object's convolution to the given window of the input tensor.
-     *  This function is used if the intermediate values have been stored as S16.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolution_s16(const Window &win);
-    /** Apply the object's convolution to the given window of the input tensor.
-     *  This function is used if the intermediate values have been stored as S32.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolution_s32(const Window &win);
-
-    std::array<int16_t, matrix_size> _conv_col; /**< Convolution coefficients */
-    uint32_t _scale;                            /**< Convolution's scale */
-};
-
-/** Interface for the kernel which applied a 1x5 vertical convolution to a tensor.*/
-using NESeparableConvolution5x5VertKernel = NESeparableConvolutionVertKernel<5>;
-/** Interface for the kernel which applied a 1x7 vertical convolution to a tensor.*/
-using NESeparableConvolution7x7VertKernel = NESeparableConvolutionVertKernel<7>;
-/** Interface for the kernel which applied a 1x9 vertical convolution to a tensor.*/
-using NESeparableConvolution9x9VertKernel = NESeparableConvolutionVertKernel<9>;
-
-/****************************************************************************************\
- *                                 Rectangle Convolution                                *
-\****************************************************************************************/
-
-/** Kernel for the running convolution on a rectangle matrix.
- *
- * @note Supports combinations of 3,5,7 and 9.
- */
-class NEConvolutionRectangleKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEConvolutionRectangleKernel";
-    }
-    /** Default constructor */
-    NEConvolutionRectangleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &&) = default;
-    /** Default destructor */
-    ~NEConvolutionRectangleKernel() = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  width            Width of convolution matrix (Number of columns)
-     * @param[in]  height           Height of convolution matrix (Number of rows)
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    unsigned int get_index(uint32_t val);
-    /** Apply the object's convolution to the given window of the input tensor.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType, unsigned int rows, unsigned int cols>
-    void convolution(const Window &win);
-
-protected:
-    const ITensor            *_input;       /**< Input tensor */
-    ITensor                  *_output;      /**< Output tensor */
-    uint32_t                  _scale;       /**< Scale of the convolution */
-    std::vector<int16_t>      _convolution; /**< Convolution matrix */
-    BorderSize                _border_size; /**< Calculated border width */
-    uint32_t                  _func_idx;    /**< Index used to specify convolution function to be used */
-    const static unsigned int _nr_supported_sizes
-    {
-        4
-    }; /**< Number of supported permutations */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECONVOLUTIONKERNEL_H */

diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
deleted file mode 100644
index 9f5dfcd..0000000
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
+++ /dev/null

@@ -1,516 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace fp16
-{
-inline void mask_top(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
-{
-    // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
-    mask = vandq_u16(mask, vcgeq_f16(vc, in0));
-    mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 1)));
-    mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 2)));
-}
-
-inline void mask_middle(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
-{
-    // vc >= nc.val[0], vc > nc.val[2]
-    mask = vandq_u16(mask, vcgeq_f16(vc, in0));
-    mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
-}
-
-inline void mask_bottom(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
-{
-    // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
-    mask = vandq_u16(mask, vcgtq_f16(vc, in0));
-    mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 1)));
-    mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
-}
-
-inline void non_maxima_suppression3x3_F32_F32(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
-{
-    auto       in  = static_cast<const float *__restrict>(in_ptr) - 1;
-    const auto out = static_cast<float *__restrict>(out_ptr);
-
-    // Get centre scores
-    const float16x8x2_t vc =
-    {
-        vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 1)), vcvt_f16_f32(vld1q_f32(in + 5))),
-        vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 9)), vcvt_f16_f32(vld1q_f32(in + 13)))
-    };
-
-    // Neighboring pixels
-    in -= in_stride;
-
-    static const float16x4_t  zero_f16x4 = vdup_n_f16(0);
-    static const uint16x8_t   zero_u16   = vdupq_n_u16(0);
-    static const uint16x8_t   true_mask  = vceqq_u16(zero_u16, zero_u16);
-    static const uint16x8x2_t true_mask_x2 =
-    {
-        true_mask,
-        true_mask
-    };
-
-    uint16x8x2_t mask = true_mask_x2;
-
-    // Top row
-    const float16x8_t tmp_top0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
-    const float16x8_t tmp_top1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
-    const float16x8_t tmp_top2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
-
-    // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2]
-    mask_top(vc.val[0], tmp_top0, tmp_top1, mask.val[0]);
-    mask_top(vc.val[1], tmp_top1, tmp_top2, mask.val[1]);
-
-    in += in_stride;
-
-    // Middle row
-    const float16x8_t tmp_mid0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
-    const float16x8_t tmp_mid1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
-    const float16x8_t tmp_mid2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
-
-    // vc >= nc.val[0], vc > nc.val[2]
-    mask_middle(vc.val[0], tmp_mid0, tmp_mid1, mask.val[0]);
-    mask_middle(vc.val[1], tmp_mid1, tmp_mid2, mask.val[1]);
-
-    in += in_stride;
-
-    // Bottom row
-    const float16x8_t tmp_bot0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
-    const float16x8_t tmp_bot1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
-    const float16x8_t tmp_bot2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
-
-    // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
-    mask_bottom(vc.val[0], tmp_bot0, tmp_bot1, mask.val[0]);
-    mask_bottom(vc.val[1], tmp_bot1, tmp_bot2, mask.val[1]);
-
-    // Store
-    static const float16x8_t zero_f16x8 = vdupq_n_f16(0);
-
-    const float16x8_t suppressed0 = vbslq_f16(mask.val[0], vc.val[0], zero_f16x8);
-    vst1q_f32(out + 0, vcvt_f32_f16(vget_low_f16(suppressed0)));
-    vst1q_f32(out + 4, vcvt_f32_f16(vget_high_f16(suppressed0)));
-
-    const float16x8_t suppressed1 = vbslq_f16(mask.val[1], vc.val[1], zero_f16x8);
-    vst1q_f32(out + 8, vcvt_f32_f16(vget_low_f16(suppressed1)));
-    vst1q_f32(out + 12, vcvt_f32_f16(vget_high_f16(suppressed1)));
-}
-
-inline void non_maxima_suppression3x3_U8_U8(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
-{
-    auto       in  = static_cast<const uint8_t *__restrict>(in_ptr) - 1;
-    const auto out = static_cast<uint8_t *__restrict>(out_ptr);
-
-    // Get centre scores
-    const uint8x16_t vc = vld1q_u8(in + 1);
-
-    // Neighboring pixels
-    in -= in_stride;
-
-    // Top row
-    const uint8x16_t l_nc_0 = vld1q_u8(in);
-    const uint8x16_t m_nc_0 = vld1q_u8(in + 1);
-    const uint8x16_t r_nc_0 = vld1q_u8(in + 2);
-
-    // Keep center scores if ...
-    // vc >= l_nc_0, vc >= m_nc_0, vc >= r_nc_0
-    uint8x16_t mask = vcgeq_u8(vc, l_nc_0);
-    mask            = vandq_u8(mask, vcgeq_u8(vc, m_nc_0));
-    mask            = vandq_u8(mask, vcgeq_u8(vc, r_nc_0));
-
-    in += in_stride;
-
-    // Middle row
-    const uint8x16_t l_nc_1 = vld1q_u8(in);
-    const uint8x16_t r_nc_1 = vld1q_u8(in + 2);
-
-    // ... and ...
-    // vc >= l_nc_1, vc > r_nc_1
-    mask = vandq_u8(mask, vcgeq_u8(vc, l_nc_1));
-    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_1));
-
-    in += in_stride;
-
-    // Bottom row
-    const uint8x16_t l_nc_2 = vld1q_u8(in);
-    const uint8x16_t m_nc_2 = vld1q_u8(in + 1);
-    const uint8x16_t r_nc_2 = vld1q_u8(in + 2);
-
-    // ... and ...
-    // vc > l_nc_2, vc > m_nc_2, vc > r_nc_2
-    mask = vandq_u8(mask, vcgtq_u8(vc, l_nc_2));
-    mask = vandq_u8(mask, vcgtq_u8(vc, m_nc_2));
-    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_2));
-
-    // Store
-    static const uint8x16_t zero = vdupq_n_u8(0);
-    vst1q_u8(out, vbslq_u8(mask, vc, zero));
-}
-} // namespace fp16
-
-void NENonMaximaSuppression3x3FP16Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    _input  = input;
-    _output = output;
-
-    switch(input->info()->data_type())
-    {
-        case DataType::U8:
-            _func = &fp16::non_maxima_suppression3x3_U8_U8;
-            break;
-        default:
-            _func = &fp16::non_maxima_suppression3x3_F32_F32;
-            break;
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    const unsigned int     num_elems_read_per_iteration      = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
-    constexpr unsigned int num_elems_written_per_iteration   = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-namespace
-{
-inline void non_maxima_suppression3x3_FLOAT_FLOAT(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
-{
-    auto       input  = static_cast<const float *__restrict>(input_ptr) - 1;
-    const auto output = static_cast<float *__restrict>(output_ptr);
-
-    // Get centre scores
-    const float32x4x4_t vc =
-    {
-        {
-            vld1q_f32(input + 1),
-            vld1q_f32(input + 5),
-            vld1q_f32(input + 9),
-            vld1q_f32(input + 13)
-        }
-    };
-
-    // Neighboring pixels
-    float32x4x4_t l_nc{ {} };
-    float32x4x4_t m_nc{ {} };
-    float32x4x4_t r_nc{ {} };
-
-    input -= input_stride;
-
-    // Row0 - Low part
-    float32x4_t tmp_low   = vld1q_f32(input);
-    float32x4_t tmp_high  = vld1q_f32(input + 4);
-    float32x4_t tmp_high1 = vld1q_f32(input + 8);
-
-    l_nc.val[0] = tmp_low;
-    m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[1] = tmp_low;
-    m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // Row0 - High part
-    tmp_low   = tmp_high1;
-    tmp_high  = vld1q_f32(input + 12);
-    tmp_high1 = vld1q_f32(input + 16);
-
-    l_nc.val[2] = tmp_low;
-    m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[3] = tmp_low;
-    m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // mc >= nc.val[0], mc >= nc.val[1], mc >= nc.val[2]
-    uint32x4x4_t mask{ {} };
-    mask.val[0] = vcgeq_f32(vc.val[0], l_nc.val[0]);
-    mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], m_nc.val[0]));
-    mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], r_nc.val[0]));
-    mask.val[1] = vcgeq_f32(vc.val[1], l_nc.val[1]);
-    mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], m_nc.val[1]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], r_nc.val[1]));
-    mask.val[2] = vcgeq_f32(vc.val[2], l_nc.val[2]);
-    mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], m_nc.val[2]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], r_nc.val[2]));
-    mask.val[3] = vcgeq_f32(vc.val[3], l_nc.val[3]);
-    mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], m_nc.val[3]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], r_nc.val[3]));
-
-    input += input_stride;
-
-    // Row1 - Low part
-    tmp_low   = vld1q_f32(input);
-    tmp_high  = vld1q_f32(input + 4);
-    tmp_high1 = vld1q_f32(input + 8);
-
-    l_nc.val[0] = tmp_low;
-    r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[1] = tmp_low;
-    r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // Row1 - High part
-    tmp_low   = tmp_high1;
-    tmp_high  = vld1q_f32(input + 12);
-    tmp_high1 = vld1q_f32(input + 16);
-
-    l_nc.val[2] = tmp_low;
-    r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[3] = tmp_low;
-    r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // mc >= nc.val[0], mc > nc.val[2]
-    mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], l_nc.val[0]));
-    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], l_nc.val[1]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], l_nc.val[2]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], l_nc.val[3]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));
-
-    input += input_stride;
-
-    // Row2 - Low part
-    tmp_low   = vld1q_f32(input);
-    tmp_high  = vld1q_f32(input + 4);
-    tmp_high1 = vld1q_f32(input + 8);
-
-    l_nc.val[0] = tmp_low;
-    m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[1] = tmp_low;
-    m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // Row2 - High part
-    tmp_low   = tmp_high1;
-    tmp_high  = vld1q_f32(input + 12);
-    tmp_high1 = vld1q_f32(input + 16);
-
-    l_nc.val[2] = tmp_low;
-    m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[3] = tmp_low;
-    m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // mc > nc.val[0], mc > nc.val[1], mc > nc.val[2]
-    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], l_nc.val[0]));
-    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], m_nc.val[0]));
-    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], l_nc.val[1]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], m_nc.val[1]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], l_nc.val[2]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], m_nc.val[2]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], l_nc.val[3]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], m_nc.val[3]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));
-
-    static const float32x4_t zero = vdupq_n_f32(0.f);
-
-    // Store
-    vst1q_f32(output + 0, vbslq_f32(mask.val[0], vc.val[0], zero));
-    vst1q_f32(output + 4, vbslq_f32(mask.val[1], vc.val[1], zero));
-    vst1q_f32(output + 8, vbslq_f32(mask.val[2], vc.val[2], zero));
-    vst1q_f32(output + 12, vbslq_f32(mask.val[3], vc.val[3], zero));
-}
-
-inline void non_maxima_suppression3x3_U8_U8(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
-{
-    auto       input  = static_cast<const uint8_t *__restrict>(input_ptr) - 1;
-    const auto output = static_cast<uint8_t *__restrict>(output_ptr);
-
-    // Get centre scores
-    const uint8x16_t vc = vld1q_u8(input + 1);
-
-    // Neighboring pixels
-    uint8x16_t l_nc{};
-    uint8x16_t m_nc{};
-    uint8x16_t r_nc{};
-
-    input -= input_stride;
-
-    // Row0
-    l_nc = vld1q_u8(input);
-    m_nc = vld1q_u8(input + 1);
-    r_nc = vld1q_u8(input + 2);
-
-    // mc >= l_nc, mc >= m_nc, mc >= r_nc
-    uint8x16_t mask = vcgeq_u8(vc, l_nc);
-    mask            = vandq_u8(mask, vcgeq_u8(vc, m_nc));
-    mask            = vandq_u8(mask, vcgeq_u8(vc, r_nc));
-
-    input += input_stride;
-
-    // Row1
-    l_nc = vld1q_u8(input);
-    r_nc = vld1q_u8(input + 2);
-
-    // mc >= l_nc, mc > r_nc
-    mask = vandq_u8(mask, vcgeq_u8(vc, l_nc));
-    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));
-
-    input += input_stride;
-
-    // Row2
-    l_nc = vld1q_u8(input);
-    m_nc = vld1q_u8(input + 1);
-    r_nc = vld1q_u8(input + 2);
-
-    // mc > l_nc, mc > m_nc, mc > r_nc
-    mask = vandq_u8(mask, vcgtq_u8(vc, l_nc));
-    mask = vandq_u8(mask, vcgtq_u8(vc, m_nc));
-    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));
-
-    static const uint8x16_t zero = vdupq_n_u8(0);
-
-    // Store
-    vst1q_u8(output, vbslq_u8(mask, vc, zero));
-}
-} // namespace
-
-NENonMaximaSuppression3x3Kernel::NENonMaximaSuppression3x3Kernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr)
-{
-}
-
-BorderSize NENonMaximaSuppression3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void NENonMaximaSuppression3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    _input  = input;
-    _output = output;
-
-    if(input->info()->data_type() == DataType::U8)
-    {
-        _func = &non_maxima_suppression3x3_U8_U8;
-    }
-    else
-    {
-        _func = &non_maxima_suppression3x3_FLOAT_FLOAT;
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    const unsigned int     num_elems_read_per_iteration      = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
-    constexpr unsigned int num_elems_written_per_iteration   = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NENonMaximaSuppression3x3Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    const size_t input_stride = _input->info()->strides_in_bytes()[1] / element_size_from_data_type(_input->info()->data_type());
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        _func(input.ptr(), output.ptr(), input_stride);
-    },
-    input, output);
-}

diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
deleted file mode 100644
index 4194dac..0000000
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
+++ /dev/null

@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H
-#define ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface to perform Non-Maxima suppression over a 3x3 window using Neon
- *
- */
-class NENonMaximaSuppression3x3Kernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NENonMaximaSuppression3x3Kernel";
-    }
-    /** Default constructor */
-    NENonMaximaSuppression3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENonMaximaSuppression3x3Kernel(const NENonMaximaSuppression3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENonMaximaSuppression3x3Kernel &operator=(const NENonMaximaSuppression3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NENonMaximaSuppression3x3Kernel(NENonMaximaSuppression3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NENonMaximaSuppression3x3Kernel &operator=(NENonMaximaSuppression3x3Kernel &&) = default;
-    /** Default destructor */
-    ~NENonMaximaSuppression3x3Kernel() = default;
-
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8/F32
-     * @param[out] output           Destination tensor. Data types supported: same as @p input
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-protected:
-    /** Common signature for all the specialised non-maxima suppression 3x3 functions
-     *
-     * @param[in]  input_ptr    Pointer to the input tensor.
-     * @param[out] output_ptr   Pointer to the output tensor
-     * @param[in]  input_stride Stride of the input tensor
-     */
-    using NonMaxSuppr3x3Function = void(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride);
-
-    NonMaxSuppr3x3Function *_func;   /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
-    const ITensor          *_input;  /**< Source tensor */
-    ITensor                *_output; /**< Destination tensor */
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Neon kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32
- */
-class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
-{
-public:
-    const char *name() const override
-    {
-        return "NENonMaximaSuppression3x3FP16Kernel";
-    }
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8/F32.
-     * @param[out] output           Destination tensor. Data types supported: same as @p input
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-};
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** Neon kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32 */
-using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
-#endif /* _ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H */

diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
new file mode 100644
index 0000000..24d0dd8
--- /dev/null
+++ b/src/core/NEON/kernels/NERemapKernel.cpp

@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/NEON/kernels/NERemapKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const float32x4_t &width, const float32x4_t &height, const int32x4_t &stride)
+{
+    const float32x4_t lowerxy = vdupq_n_f32(-1.f); // Lower clamp bound shared by x and y
+
+    float32x4_t x = vld1q_f32(mapx_ptr);
+    float32x4_t y = vld1q_f32(mapy_ptr);
+
+    // Clamp x to [-1, width] and y to [-1, height]
+    x = vmaxq_f32(lowerxy, vminq_f32(x, width));
+    y = vmaxq_f32(lowerxy, vminq_f32(y, height));
+
+    const int32x4_t x_s32 = vcvtq_s32_f32(x);
+    const int32x4_t y_s32 = vcvtq_s32_f32(y);
+
+    return vmlaq_s32(x_s32, y_s32, stride); // Per-lane offset: x + y * stride
+}
+
+} // namespace
+
+NERemapKernel::NERemapKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
+{
+}
+
+BorderSize NERemapKernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+
+    _input  = input;
+    _output = output;
+    _map_x  = map_x;
+    _map_y  = map_y;
+
+    switch(policy) // Select the remap implementation matching the requested interpolation policy
+    {
+        case InterpolationPolicy::NEAREST_NEIGHBOR:
+        {
+            _func = &NERemapKernel::remap_nearest;
+            break;
+        }
+        case InterpolationPolicy::BILINEAR:
+        {
+            _func = &NERemapKernel::remap_bilinear;
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+            break;
+    }
+
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window: iterate over the output tensor in steps of num_elems_processed_per_iteration
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    const int total_right  = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
+    const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0); // Right border is added only when total_right equals the input width
+
+    AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
+
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mapx_access(map_x->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mapy_access(map_y->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, mapx_access, mapy_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NERemapKernel::remap_nearest(const Window &window)
+{
+    // Don't increment in X and Y direction for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    Iterator in(_input, win_in);
+    Iterator out(_output, window);
+    Iterator mapx(_map_x, window);
+    Iterator mapy(_map_y, window);
+
+    const float32x4_t width     = vdupq_n_f32(static_cast<float>(_input->info()->dimension(0))); // Upper clamp bound for x, broadcast to all lanes
+    const float32x4_t height    = vdupq_n_f32(static_cast<float>(_input->info()->dimension(1))); // Upper clamp bound for y, broadcast to all lanes
+    const int32x4_t   in_stride = vdupq_n_s32(static_cast<int32_t>(_input->info()->strides_in_bytes()[1])); // Input row stride in bytes, broadcast to all lanes
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        const auto     mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
+        const auto     mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
+        const uint8_t *in_ptr   = in.ptr();
+
+        const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr + 0, mapy_ptr + 0, width, height, in_stride); // Offsets for output elements 0-3
+        const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, width, height, in_stride); // Offsets for output elements 4-7
+        const int32x4_t offset2 = offset_nearest_interpolation(mapx_ptr + 8, mapy_ptr + 8, width, height, in_stride); // Offsets for output elements 8-11
+        const int32x4_t offset3 = offset_nearest_interpolation(mapx_ptr + 12, mapy_ptr + 12, width, height, in_stride); // Offsets for output elements 12-15
+
+        uint8x16_t tmp = vdupq_n_u8(0); // Gather the 16 source pixels into tmp one lane at a time
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 0)], tmp, 8);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 1)], tmp, 9);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 2)], tmp, 10);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 3)], tmp, 11);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 0)], tmp, 12);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 1)], tmp, 13);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 2)], tmp, 14);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 3)], tmp, 15);
+        vst1q_u8(out.ptr(), tmp);
+    },
+    in, out, mapx, mapy);
+}
+
+void NERemapKernel::remap_bilinear(const Window &window)
+{
+    using namespace scale_helpers;
+
+    // Bilinear remap: every iteration samples the input at 16 (map_x, map_y) positions and stores 16 U8 output pixels.
+    // The input window is pinned in X and Y so that in.ptr() stays at the start of the current plane, the base address used for sampling.
+    Window win_in(window);
+    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    Iterator in(_input, win_in);
+    Iterator out(_output, window);
+    Iterator mapx(_map_x, window);
+    Iterator mapy(_map_y, window);
+
+    const size_t width     = _input->info()->dimension(0);
+    const size_t height    = _input->info()->dimension(1);
+    const size_t in_stride = _input->info()->strides_in_bytes()[1]; // byte stride of input dimension 1 (Y)
+
+    execute_window_loop(window, [&](const Coordinates &) // NOTE(review): assumes the window advances 16 elements in X per iteration (set outside this function) - confirm
+    {
+        const auto     mapx_ptr = reinterpret_cast<const float *>(mapx.ptr()); // maps are read-only inputs, same cast as remap_nearest()
+        const auto     mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
+        const uint8_t *in_ptr   = in.ptr();
+
+        uint8x8_t tmp0 = vdup_n_u8(0); // output pixels 0..7; NOTE(review): the helper presumably clamps out-of-range coordinates - confirm
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
+
+        uint8x8_t tmp1 = vdup_n_u8(0); // output pixels 8..15
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
+
+        vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); // single 16-byte store: tmp0 is the low half, tmp1 the high half
+    },
+    in, out, mapx, mapy);
+}
+
+void NERemapKernel::run(const Window &window, const ThreadInfo &info) // Runs the remap function stored in _func over the given window
+{
+    ARM_COMPUTE_UNUSED(info); // thread info is not used by this kernel
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); // the requested window is checked against the kernel's own window
+    ARM_COMPUTE_ERROR_ON(_func == nullptr); // no remap function has been selected
+
+    (this->*_func)(window); // member-function-pointer dispatch to remap_nearest() or remap_bilinear()
+}

diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h
new file mode 100644
index 0000000..adc7f4b
--- /dev/null
+++ b/src/core/NEON/kernels/NERemapKernel.h

@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEREMAPKERNEL_H
+#define ARM_COMPUTE_NEREMAPKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Neon kernel that remaps a U8 tensor through F32 X/Y coordinate maps, using nearest or bilinear interpolation */
+class NERemapKernel : public INEKernel
+{
+public:
+    const char *name() const override // fixed kernel name string
+    {
+        return "NERemapKernel";
+    }
+    /** Default constructor */
+    NERemapKernel();
+    /** Copy construction is disabled (this class stores raw tensor pointers) */
+    NERemapKernel(const NERemapKernel &) = delete;
+    /** Copy assignment is disabled (this class stores raw tensor pointers) */
+    NERemapKernel &operator=(const NERemapKernel &) = delete;
+    /** Move construction is allowed (compiler-generated) */
+    NERemapKernel(NERemapKernel &&) = default;
+    /** Move assignment is allowed (compiler-generated) */
+    NERemapKernel &operator=(NERemapKernel &&) = default;
+    /** Default destructor */
+    ~NERemapKernel() = default;
+
+    /** Initialize the kernel's input, coordinate maps, output and interpolation policy.
+     *
+     * @param[in]  input  Source tensor. Data type supported: U8.
+     * @param[in]  map_x  Map of X coordinates to sample in the source, read alongside the output window. Data type supported: F32.
+     * @param[in]  map_y  Map of Y coordinates to sample in the source, read alongside the output window. Data type supported: F32.
+     * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
+     * @param[in]  policy The interpolation type. Presumably selects remap_nearest() or remap_bilinear() through _func; configure() is not visible here, confirm.
+     */
+    void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Remap the given window using nearest interpolation */
+    void remap_nearest(const Window &window);
+    /** Remap the given window using bilinear interpolation */
+    void remap_bilinear(const Window &window);
+    /** Remap function to use for the particular interpolation type passed to configure(); invoked by run() */
+    void (NERemapKernel::*_func)(const Window &window);
+
+    const ITensor *_input;  /**< Input image sampled by the remap functions */
+    ITensor       *_output; /**< Output image written by the remap functions */
+    const ITensor *_map_x;  /**< Input remap x coordinates (F32) */
+    const ITensor *_map_y;  /**< Input remap y coordinates (F32) */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */

diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
deleted file mode 100644
index ff5b0a8..0000000
--- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp
+++ /dev/null

@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
-
-#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
-}
-
-void CLAbsoluteDifference::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-    auto k = std::make_unique<CLAbsoluteDifferenceKernel>();
-    k->configure(compile_context, input1, input2, output);
-    _kernel = std::move(k);
-}

diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
deleted file mode 100644
index 44020fd..0000000
--- a/src/runtime/CL/functions/CLAccumulate.cpp
+++ /dev/null

@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
-
-#include "src/core/CL/kernels/CLAccumulateKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, accum);
-}
-
-void CLAccumulate::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum)
-{
-    auto k = std::make_unique<CLAccumulateKernel>();
-    k->configure(compile_context, input, accum);
-    _kernel = std::move(k);
-}
-
-void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum);
-}
-
-void CLAccumulateWeighted::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum)
-{
-    auto k = std::make_unique<CLAccumulateWeightedKernel>();
-    k->configure(compile_context, input, alpha, accum);
-    _kernel = std::move(k);
-}
-
-void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum);
-}
-
-void CLAccumulateSquared::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum)
-{
-    auto k = std::make_unique<CLAccumulateSquaredKernel>();
-    k->configure(compile_context, input, shift, accum);
-    _kernel = std::move(k);
-}

diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp
deleted file mode 100644
index 09e24d1..0000000
--- a/src/runtime/CL/functions/CLBox3x3.cpp
+++ /dev/null

@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLBox3x3.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLBox3x3Kernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLBox3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLBox3x3Kernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
deleted file mode 100644
index 7e99a1b..0000000
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ /dev/null

@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLCannyEdge.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
-#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
-#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
-#include "src/core/CL/kernels/CLCannyEdgeKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
-#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
-
-using namespace arm_compute;
-
-CLCannyEdge::CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _sobel(),
-      _gradient(std::make_unique<CLGradientKernel>()),
-      _border_mag_gradient(std::make_unique<CLFillBorderKernel>()),
-      _non_max_suppr(std::make_unique<CLEdgeNonMaxSuppressionKernel>()),
-      _edge_trace(std::make_unique<CLEdgeTraceKernel>()),
-      _gx(),
-      _gy(),
-      _mag(),
-      _phase(),
-      _nonmax(),
-      _visited(),
-      _recorded(),
-      _l1_list_counter(),
-      _l1_stack(),
-      _output(nullptr)
-{
-}
-
-CLCannyEdge::~CLCannyEdge() = default;
-
-void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
-                            uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, gradient_size, norm_type, border_mode, constant_border_value);
-}
-
-void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type,
-                            BorderMode border_mode,
-                            uint8_t    constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
-    ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7));
-    ARM_COMPUTE_ERROR_ON((lower_thr < 0) || (lower_thr >= upper_thr));
-
-    _output = output;
-
-    const unsigned int L1_hysteresis_stack_size = 8;
-    const TensorShape  shape                    = input->info()->tensor_shape();
-
-    TensorInfo gradient_info;
-    TensorInfo info;
-
-    // Initialize images
-    if(gradient_size < 7)
-    {
-        gradient_info.init(shape, 1, arm_compute::DataType::S16);
-        info.init(shape, 1, arm_compute::DataType::U16);
-    }
-    else
-    {
-        gradient_info.init(shape, 1, arm_compute::DataType::S32);
-        info.init(shape, 1, arm_compute::DataType::U32);
-    }
-
-    _gx.allocator()->init(gradient_info);
-    _gy.allocator()->init(gradient_info);
-    _mag.allocator()->init(info);
-    _nonmax.allocator()->init(info);
-
-    TensorInfo info_u8(shape, 1, arm_compute::DataType::U8);
-    _phase.allocator()->init(info_u8);
-    _l1_list_counter.allocator()->init(info_u8);
-
-    TensorInfo info_u32(shape, 1, arm_compute::DataType::U32);
-    _visited.allocator()->init(info_u32);
-    _recorded.allocator()->init(info_u32);
-
-    TensorShape shape_l1_stack = input->info()->tensor_shape();
-    shape_l1_stack.set(0, input->info()->dimension(0) * L1_hysteresis_stack_size);
-    TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32);
-    _l1_stack.allocator()->init(info_s32);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_gx);
-    _memory_group.manage(&_gy);
-
-    // Configure/Init sobelNxN
-    if(gradient_size == 3)
-    {
-        auto k = std::make_unique<CLSobel3x3>();
-        k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-        _sobel = std::move(k);
-    }
-    else if(gradient_size == 5)
-    {
-        auto k = std::make_unique<CLSobel5x5>();
-        k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-        _sobel = std::move(k);
-    }
-    else if(gradient_size == 7)
-    {
-        auto k = std::make_unique<CLSobel7x7>();
-        k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-        _sobel = std::move(k);
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR_VAR("Gradient size %d not supported", gradient_size);
-    }
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_mag);
-    _memory_group.manage(&_phase);
-
-    // Configure gradient
-    _gradient->configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type);
-
-    // Allocate intermediate buffers
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_nonmax);
-
-    // Configure non-maxima suppression
-    _non_max_suppr->configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
-
-    // Allocate intermediate buffers
-    _phase.allocator()->allocate();
-
-    // Fill border around magnitude image as non-maxima suppression will access
-    // it. If border mode is undefined filling the border is a nop.
-    _border_mag_gradient->configure(compile_context, &_mag, _non_max_suppr->border_size(), border_mode, constant_border_value);
-
-    // Allocate intermediate buffers
-    _mag.allocator()->allocate();
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_visited);
-    _memory_group.manage(&_recorded);
-    _memory_group.manage(&_l1_stack);
-    _memory_group.manage(&_l1_list_counter);
-
-    // Configure edge tracing
-    _edge_trace->configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
-
-    // Allocate intermediate buffers
-    _visited.allocator()->allocate();
-    _recorded.allocator()->allocate();
-    _l1_stack.allocator()->allocate();
-    _l1_list_counter.allocator()->allocate();
-    _nonmax.allocator()->allocate();
-}
-
-void CLCannyEdge::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Run sobel
-    _sobel->run();
-
-    // Run phase and magnitude calculation
-    CLScheduler::get().enqueue(*_gradient, false);
-
-    // Fill border before non-maxima suppression. Nop for border mode undefined.
-    CLScheduler::get().enqueue(*_border_mag_gradient, false);
-
-    // Run non max suppresion
-    _nonmax.clear(CLScheduler::get().queue());
-    CLScheduler::get().enqueue(*_non_max_suppr, false);
-
-    // Clear temporary structures and run edge trace
-    _output->clear(CLScheduler::get().queue());
-    _visited.clear(CLScheduler::get().queue());
-    _recorded.clear(CLScheduler::get().queue());
-    _l1_list_counter.clear(CLScheduler::get().queue());
-    _l1_stack.clear(CLScheduler::get().queue());
-    CLScheduler::get().enqueue(*_edge_trace, true);
-}

diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp
deleted file mode 100644
index 543de9c..0000000
--- a/src/runtime/CL/functions/CLChannelCombine.cpp
+++ /dev/null

@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
-
-#include "src/core/CL/kernels/CLChannelCombineKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output);
-}
-
-void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
-{
-    auto k = std::make_unique<CLChannelCombineKernel>();
-    k->configure(compile_context, plane0, plane1, plane2, plane3, output);
-    _kernel = std::move(k);
-}
-
-void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output);
-}
-
-void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
-{
-    auto k = std::make_unique<CLChannelCombineKernel>();
-    k->configure(compile_context, plane0, plane1, plane2, output);
-    _kernel = std::move(k);
-}

diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp
deleted file mode 100644
index 645fc05..0000000
--- a/src/runtime/CL/functions/CLChannelExtract.cpp
+++ /dev/null

@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
-
-#include "src/core/CL/kernels/CLChannelExtractKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
-}
-
-void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output)
-{
-    auto k = std::make_unique<CLChannelExtractKernel>();
-    k->configure(compile_context, input, channel, output);
-    _kernel = std::move(k);
-}
-
-void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
-}
-
-void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output)
-{
-    auto k = std::make_unique<CLChannelExtractKernel>();
-    k->configure(compile_context, input, channel, output);
-    _kernel = std::move(k);
-}

diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp
deleted file mode 100644
index 9aeeb65..0000000
--- a/src/runtime/CL/functions/CLColorConvert.cpp
+++ /dev/null

@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
-
-#include "src/core/CL/kernels/CLColorConvertKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = std::make_unique<CLColorConvertKernel>();
-    k->configure(compile_context, input, output);
-    _kernel = std::move(k);
-}
-
-void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output)
-{
-    auto k = std::make_unique<CLColorConvertKernel>();
-    k->configure(compile_context, input, output);
-    _kernel = std::move(k);
-}
-
-void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output)
-{
-    auto k = std::make_unique<CLColorConvertKernel>();
-    k->configure(compile_context, input, output);
-    _kernel = std::move(k);
-}
-
-void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output)
-{
-    auto k = std::make_unique<CLColorConvertKernel>();
-    k->configure(compile_context, input, output);
-    _kernel = std::move(k);
-}

diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
deleted file mode 100644
index ffc7cda..0000000
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ /dev/null

@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLConvolution.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "src/core/CL/kernels/CLConvolutionKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value);
-}
-
-void CLConvolution3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
-                                 uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLConvolution3x3Kernel>();
-    k->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
-
-template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(std::make_unique<CLSeparableConvolutionHorKernel<matrix_size>>()),
-      _kernel_vert(std::make_unique<CLSeparableConvolutionVertKernel<matrix_size>>()), _kernel(std::make_unique<CLConvolutionKernel<matrix_size>>()), _border_handler(std::make_unique<CLFillBorderKernel>())
-{
-}
-
-template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::~CLConvolutionSquare() = default;
-
-template <unsigned int matrix_size>
-void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
-                                                 uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value);
-}
-
-template <unsigned int matrix_size>
-void CLConvolutionSquare<matrix_size>::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
-                                                 uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(conv == nullptr);
-    std::array<int16_t, matrix_size> conv_col{ 0 };
-    std::array<int16_t, matrix_size> conv_row{ 0 };
-    _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size);
-
-    if(_is_separable)
-    {
-        std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size);
-        _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_tmp);
-
-        if(scale == 0)
-        {
-            scale = calculate_matrix_scale(conv, matrix_size);
-        }
-
-        _kernel_hor->configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
-        _kernel_vert->configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
-        _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value));
-
-        // Allocate intermediate buffer
-        _tmp.allocator()->allocate();
-    }
-    else
-    {
-        _kernel->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
-        _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-    }
-}
-
-template <unsigned int matrix_size>
-void                   CLConvolutionSquare<matrix_size>::run()
-{
-    CLScheduler::get().enqueue(*_border_handler);
-
-    if(_is_separable)
-    {
-        MemoryGroupResourceScope scope_mg(_memory_group);
-
-        CLScheduler::get().enqueue(*_kernel_hor, false);
-        CLScheduler::get().enqueue(*_kernel_vert);
-    }
-    else
-    {
-        CLScheduler::get().enqueue(*_kernel);
-    }
-}
-
-template class arm_compute::CLConvolutionSquare<5>;
-template class arm_compute::CLConvolutionSquare<7>;
-template class arm_compute::CLConvolutionSquare<9>;
-
-void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, rows, cols, scale, border_mode, constant_border_value);
-}
-
-void CLConvolutionRectangle::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale,
-                                       BorderMode border_mode, uint8_t constant_border_value)
-{
-    border_mode = (border_mode == BorderMode::UNDEFINED) ? BorderMode::CONSTANT : border_mode;
-    auto k      = std::make_unique<CLConvolutionRectangleKernel>();
-    k->configure(compile_context, input, output, conv, rows, cols, scale, false);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
deleted file mode 100644
index 2e3ecf7..0000000
--- a/src/runtime/CL/functions/CLDerivative.cpp
+++ /dev/null

@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDerivative.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLDerivativeKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLDerivative::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLDerivativeKernel>();
-    k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
deleted file mode 100644
index 92c5cc7..0000000
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ /dev/null

@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDilate.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLDilateKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLDilate::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLDilateKernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
deleted file mode 100644
index 11607cf..0000000
--- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp
+++ /dev/null

@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
-
-#include "arm_compute/core/CL/ICLDistribution1D.h"
-#include "arm_compute/core/CL/ICLLut.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLHistogramKernel.h"
-#include "src/core/CL/kernels/CLTableLookupKernel.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include <numeric>
-
-using namespace arm_compute;
-
-namespace
-{
-void calculate_cum_dist_and_lut(CLDistribution1D &dist, CLDistribution1D &cum_dist, CLLut &lut)
-{
-    dist.map(true);
-    cum_dist.map(true);
-    lut.map(true);
-
-    const uint32_t *dist_ptr     = dist.buffer();
-    uint32_t       *cum_dist_ptr = cum_dist.buffer();
-    uint8_t        *lut_ptr      = lut.buffer();
-
-    ARM_COMPUTE_ERROR_ON(dist_ptr == nullptr);
-    ARM_COMPUTE_ERROR_ON(cum_dist_ptr == nullptr);
-    ARM_COMPUTE_ERROR_ON(lut_ptr == nullptr);
-
-    // Calculate cumulative distribution
-    std::partial_sum(dist_ptr, dist_ptr + 256, cum_dist_ptr);
-
-    // Get the number of pixels that have the lowest value in the input image
-    const uint32_t num_lowest_pixels = *std::find_if(dist_ptr, dist_ptr + 256, [](const uint32_t &v)
-    {
-        return v > 0;
-    });
-    const size_t image_size = cum_dist_ptr[255];
-
-    if(image_size == num_lowest_pixels)
-    {
-        std::iota(lut_ptr, lut_ptr + 256, 0);
-    }
-    else
-    {
-        const float diff = image_size - num_lowest_pixels;
-
-        for(size_t i = 0; i < 256; ++i)
-        {
-            lut_ptr[i] = lround((cum_dist_ptr[i] - num_lowest_pixels) / diff * 255.f);
-        }
-    }
-
-    dist.unmap();
-    cum_dist.unmap();
-    lut.unmap();
-}
-} // namespace
-
-CLEqualizeHistogram::CLEqualizeHistogram()
-    : _histogram_kernel(std::make_unique<CLHistogramKernel>()),
-      _border_histogram_kernel(std::make_unique<CLHistogramBorderKernel>()),
-      _map_histogram_kernel(std::make_unique<CLTableLookupKernel>()),
-      _hist(nr_bins, 0, max_range),
-      _cum_dist(nr_bins, 0, max_range),
-      _cd_lut(nr_bins, DataType::U8)
-{
-}
-
-CLEqualizeHistogram::~CLEqualizeHistogram() = default;
-
-void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLEqualizeHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output)
-{
-    _histogram_kernel->configure(compile_context, input, &_hist);
-    _border_histogram_kernel->configure(compile_context, input, &_hist);
-    _map_histogram_kernel->configure(compile_context, input, &_cd_lut, output);
-}
-
-void CLEqualizeHistogram::run()
-{
-    // Calculate histogram of input.
-    CLScheduler::get().enqueue(*_histogram_kernel, false);
-
-    // Calculate remaining pixels when image is not multiple of the elements of histogram kernel
-    CLScheduler::get().enqueue(*_border_histogram_kernel, false);
-
-    // Calculate cumulative distribution of histogram and create LUT.
-    calculate_cum_dist_and_lut(_hist, _cum_dist, _cd_lut);
-
-    // Map input to output using created LUT.
-    CLScheduler::get().enqueue(*_map_histogram_kernel);
-}

diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp
deleted file mode 100644
index 29551fc..0000000
--- a/src/runtime/CL/functions/CLErode.cpp
+++ /dev/null

@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLErode.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLErodeKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLErode::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLErodeKernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
deleted file mode 100644
index a3a62d6..0000000
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ /dev/null

@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLFastCorners.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "src/core/CL/kernels/CLFastCornersKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <algorithm>
-#include <cstring>
-
-using namespace arm_compute;
-
-CLFastCorners::CLFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _fast_corners_kernel(std::make_unique<CLFastCornersKernel>()),
-      _suppr_func(),
-      _copy_array_kernel(std::make_unique<CLCopyToArrayKernel>()),
-      _output(),
-      _suppr(),
-      _win(),
-      _non_max(false),
-      _num_corners(nullptr),
-      _num_buffer(),
-      _corners(nullptr),
-      _constant_border_value(0)
-{
-}
-
-CLFastCorners::~CLFastCorners() = default;
-
-void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
-                              unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, threshold, nonmax_suppression, corners, num_corners, border_mode, constant_border_value);
-}
-
-void CLFastCorners::configure(const CLCompileContext &compile_context, const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
-                              unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode);
-    ARM_COMPUTE_ERROR_ON(nullptr == corners);
-    ARM_COMPUTE_ERROR_ON(threshold < 1 && threshold > 255);
-
-    TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::U8);
-    _output.allocator()->init(tensor_info);
-
-    _non_max               = nonmax_suppression;
-    _num_corners           = num_corners;
-    _corners               = corners;
-    _num_buffer            = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
-    _constant_border_value = constant_border_value;
-
-    const bool update_number = (nullptr != _num_corners);
-
-    _memory_group.manage(&_output);
-    _fast_corners_kernel->configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode);
-
-    if(!_non_max)
-    {
-        _copy_array_kernel->configure(compile_context, &_output, update_number, _corners, &_num_buffer);
-    }
-    else
-    {
-        _suppr.allocator()->init(tensor_info);
-        _memory_group.manage(&_suppr);
-
-        _suppr_func.configure(compile_context, &_output, &_suppr, border_mode);
-        _copy_array_kernel->configure(compile_context, &_suppr, update_number, _corners, &_num_buffer);
-
-        _suppr.allocator()->allocate();
-    }
-
-    // Allocate intermediate tensors
-    _output.allocator()->allocate();
-}
-
-void CLFastCorners::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    if(_non_max)
-    {
-        ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function");
-        const auto out_buffer = static_cast<unsigned char *>(q.enqueueMapBuffer(_output.cl_buffer(), CL_TRUE, CL_MAP_WRITE, 0, _output.info()->total_size()));
-        memset(out_buffer, 0, _output.info()->total_size());
-        q.enqueueUnmapMemObject(_output.cl_buffer(), out_buffer);
-    }
-
-    CLScheduler::get().enqueue(*_fast_corners_kernel, false);
-
-    if(_non_max)
-    {
-        _suppr_func.run();
-    }
-
-    CLScheduler::get().enqueue(*_copy_array_kernel, false);
-
-    unsigned int get_num_corners = 0;
-    q.enqueueReadBuffer(_num_buffer, CL_TRUE, 0, sizeof(unsigned int), &get_num_corners);
-
-    size_t corner_size = std::min(static_cast<size_t>(get_num_corners), _corners->max_num_values());
-
-    _corners->resize(corner_size);
-
-    if(_num_corners != nullptr)
-    {
-        *_num_corners = get_num_corners;
-    }
-
-    q.flush();
-}

diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
deleted file mode 100644
index 8eeade2..0000000
--- a/src/runtime/CL/functions/CLGaussian3x3.cpp
+++ /dev/null

@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGaussian3x3Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLGaussian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLGaussian3x3Kernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
deleted file mode 100644
index ee72fcb..0000000
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ /dev/null

@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-CLGaussian5x5::CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _kernel_hor(std::make_unique<CLGaussian5x5HorKernel>()),
-      _kernel_vert(std::make_unique<CLGaussian5x5VertKernel>()),
-      _border_handler(std::make_unique<CLFillBorderKernel>()),
-      _tmp()
-{
-}
-
-CLGaussian5x5::~CLGaussian5x5() = default;
-
-void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16));
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_tmp);
-
-    // Configure kernels
-    _kernel_hor->configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED);
-    _kernel_vert->configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED);
-    _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value));
-
-    // Allocate intermediate buffers
-    _tmp.allocator()->allocate();
-}
-
-void CLGaussian5x5::run()
-{
-    CLScheduler::get().enqueue(*_border_handler, false);
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    CLScheduler::get().enqueue(*_kernel_hor, false);
-    CLScheduler::get().enqueue(*_kernel_vert);
-}

diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
deleted file mode 100644
index 9fe35f6..0000000
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ /dev/null

@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
-#include "src/core/CL/kernels/CLGaussianPyramidKernel.h"
-#include "src/core/CL/kernels/CLScaleKernel.h"
-
-#include <cstddef>
-
-using namespace arm_compute;
-
-CLGaussianPyramid::CLGaussianPyramid()
-    : _input(nullptr), _pyramid(nullptr), _tmp()
-{
-}
-
-CLGaussianPyramid::~CLGaussianPyramid() = default;
-
-CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
-    : _horizontal_border_handler(),
-      _vertical_border_handler(),
-      _horizontal_reduction(),
-      _vertical_reduction()
-{
-}
-
-CLGaussianPyramidHalf::~CLGaussianPyramidHalf() = default;
-
-void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value);
-}
-
-void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(pyramid == nullptr);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
-    ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
-
-    // Constant value to use for vertical fill border when the border mode is CONSTANT
-    const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6;
-
-    /* Get number of pyramid levels */
-    const size_t num_levels = pyramid->info()->num_levels();
-
-    _input   = input;
-    _pyramid = pyramid;
-
-    if(num_levels > 1)
-    {
-        _horizontal_border_handler.reserve(num_levels - 1);
-        _vertical_border_handler.reserve(num_levels - 1);
-        _horizontal_reduction.reserve(num_levels - 1);
-        _vertical_reduction.reserve(num_levels - 1);
-
-        // Apply half scale to the X dimension of the tensor shape
-        TensorShape tensor_shape = pyramid->info()->tensor_shape();
-        tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
-
-        PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16);
-        _tmp.init(pyramid_info);
-
-        for(size_t i = 0; i < num_levels - 1; ++i)
-        {
-            /* Configure horizontal kernel */
-            _horizontal_reduction.emplace_back(std::make_unique<CLGaussianPyramidHorKernel>());
-            _horizontal_reduction.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
-
-            /* Configure vertical kernel */
-            _vertical_reduction.emplace_back(std::make_unique<CLGaussianPyramidVertKernel>());
-            _vertical_reduction.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
-
-            /* Configure border */
-            _horizontal_border_handler.emplace_back(std::make_unique<CLFillBorderKernel>());
-            _horizontal_border_handler.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction.back()->border_size(), border_mode, PixelValue(constant_border_value));
-
-            /* Configure border */
-            _vertical_border_handler.emplace_back(std::make_unique<CLFillBorderKernel>());
-            _vertical_border_handler.back()->configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction.back()->border_size(), border_mode, PixelValue(pixel_value_u16));
-        }
-        _tmp.allocate();
-    }
-}
-
-void CLGaussianPyramidHalf::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
-
-    /* Get number of pyramid levels */
-    const size_t num_levels = _pyramid->info()->num_levels();
-
-    /* The first level of the pyramid has the input image */
-    _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
-    _input->map(CLScheduler::get().queue(), true /* blocking */);
-    _pyramid->get_pyramid_level(0)->copy_from(*_input);
-
-    _input->unmap(CLScheduler::get().queue());
-    _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
-
-    for(unsigned int i = 0; i < num_levels - 1; ++i)
-    {
-        CLScheduler::get().enqueue(*_horizontal_border_handler[i], false);
-        CLScheduler::get().enqueue(*_horizontal_reduction[i], false);
-        CLScheduler::get().enqueue(*_vertical_border_handler[i], false);
-        CLScheduler::get().enqueue(*_vertical_reduction[i], false);
-    }
-}
-
-CLGaussianPyramidOrb::CLGaussianPyramidOrb() // NOLINT
-    : _gauss5x5(),
-      _scale_nearest()
-{
-}
-
-void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value);
-}
-
-void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
-    ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale());
-
-    /* Get number of pyramid levels */
-    const size_t num_levels = pyramid->info()->num_levels();
-
-    _input   = input;
-    _pyramid = pyramid;
-
-    if(num_levels > 1)
-    {
-        _gauss5x5.resize(num_levels - 1);
-        _scale_nearest.reserve(num_levels - 1);
-
-        PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
-
-        _tmp.init(pyramid_info);
-
-        for(size_t i = 0; i < num_levels - 1; ++i)
-        {
-            /* Configure gaussian 5x5 */
-            _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
-
-            /* Configure scale image kernel */
-            _scale_nearest.emplace_back(std::make_unique<CLScaleKernel>());
-            _scale_nearest.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER });
-        }
-
-        _tmp.allocate();
-    }
-}
-
-void CLGaussianPyramidOrb::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
-
-    /* Get number of pyramid levels */
-    const size_t num_levels = _pyramid->info()->num_levels();
-
-    /* The first level of the pyramid has the input image */
-    _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
-    _input->map(CLScheduler::get().queue(), true /* blocking */);
-    _pyramid->get_pyramid_level(0)->copy_from(*_input);
-    _input->unmap(CLScheduler::get().queue());
-    _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
-
-    for(unsigned int i = 0; i < num_levels - 1; ++i)
-    {
-        _gauss5x5[i].run();
-        CLScheduler::get().enqueue(*_scale_nearest[i]);
-    }
-}

diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
deleted file mode 100644
index 8d9ea17..0000000
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ /dev/null

@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/HOGInfo.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-using namespace arm_compute;
-
-CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _gradient(),
-      _orient_bin(std::make_unique<CLHOGOrientationBinningKernel>()),
-      _block_norm(std::make_unique<CLHOGBlockNormalizationKernel>()),
-      _mag(),
-      _phase(),
-      _hog_space()
-{
-}
-
-CLHOGDescriptor::~CLHOGDescriptor() = default;
-
-void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, hog, border_mode, constant_border_value);
-}
-
-void CLHOGDescriptor::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    ARM_COMPUTE_ERROR_ON(nullptr == hog);
-
-    const HOGInfo *hog_info = hog->info();
-    const size_t   width    = input->info()->dimension(Window::DimX);
-    const size_t   height   = input->info()->dimension(Window::DimY);
-    const size_t   num_bins = hog_info->num_bins();
-
-    Size2D cell_size = hog_info->cell_size();
-
-    // Calculate number of cells along the x and y directions for the hog_space
-    const size_t num_cells_x = width / cell_size.width;
-    const size_t num_cells_y = height / cell_size.height;
-
-    // TensorShape of the input image
-    const TensorShape &shape_img = input->info()->tensor_shape();
-
-    // TensorShape of the hog space
-    TensorShape shape_hog_space = input->info()->tensor_shape();
-    shape_hog_space.set(Window::DimX, num_cells_x);
-    shape_hog_space.set(Window::DimY, num_cells_y);
-
-    // Intitialize tensors for magnitude, phase and hog space
-    TensorInfo info_mag(shape_img, Format::S16);
-    _mag.allocator()->init(info_mag);
-
-    TensorInfo info_phase(shape_img, Format::U8);
-    _phase.allocator()->init(info_phase);
-
-    TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
-    _hog_space.allocator()->init(info_space);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_mag);
-    _memory_group.manage(&_phase);
-
-    // Initialise gradient kernel
-    _gradient.configure(compile_context, input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_hog_space);
-
-    // Initialise orientation binning kernel
-    _orient_bin->configure(compile_context, &_mag, &_phase, &_hog_space, hog->info());
-
-    // Initialize HOG norm kernel
-    _block_norm->configure(compile_context, &_hog_space, output, hog->info());
-
-    // Allocate intermediate tensors
-    _mag.allocator()->allocate();
-    _phase.allocator()->allocate();
-    _hog_space.allocator()->allocate();
-}
-
-void CLHOGDescriptor::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Run gradient
-    _gradient.run();
-
-    // Run orientation binning
-    CLScheduler::get().enqueue(*_orient_bin, false);
-
-    // Run block normalization
-    CLScheduler::get().enqueue(*_block_norm);
-}
\ No newline at end of file

diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp
deleted file mode 100644
index 365021c..0000000
--- a/src/runtime/CL/functions/CLHOGDetector.cpp
+++ /dev/null

@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
-
-#include <algorithm>
-
-using namespace arm_compute;
-
-CLHOGDetector::CLHOGDetector()
-    : _hog_detector_kernel(std::make_unique<CLHOGDetectorKernel>()), _detection_windows(nullptr), _num_detection_windows()
-{
-}
-
-CLHOGDetector::~CLHOGDetector() = default;
-
-void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, detection_window_stride, threshold, idx_class);
-}
-
-void CLHOGDetector::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride,
-                              float threshold, size_t idx_class)
-{
-    _detection_windows = detection_windows;
-
-    // Allocate buffer for storing the number of detected objects
-    _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
-
-    // Configure HOGDetectorKernel
-    _hog_detector_kernel->configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
-}
-
-void CLHOGDetector::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    // Reset number of detections
-    const unsigned int init_num_detection_windows = _detection_windows->num_values();
-    q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows);
-
-    // Run CLHOGDetectorKernel
-    CLScheduler::get().enqueue(*_hog_detector_kernel);
-
-    // Read number of detections
-    unsigned int num_detection_windows = 0;
-    q.enqueueReadBuffer(_num_detection_windows, CL_TRUE, 0, sizeof(unsigned int), &num_detection_windows);
-
-    // Update the number of values stored in _detection_windows
-    _detection_windows->resize(static_cast<size_t>(num_detection_windows));
-
-    q.flush();
-}
\ No newline at end of file

diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
deleted file mode 100644
index f3aa527..0000000
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ /dev/null

@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-using namespace arm_compute;
-
-CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _derivative(),
-      _mag_phase(std::make_unique<CLMagnitudePhaseKernel>()),
-      _gx(),
-      _gy()
-{
-}
-
-void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_magnitude, output_phase, phase_type, border_mode, constant_border_value);
-}
-
-void CLHOGGradient::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode,
-                              uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8);
-
-    const TensorShape &shape_img = input->info()->tensor_shape();
-
-    // Allocate image memory
-    TensorInfo info(shape_img, Format::S16);
-    _gx.allocator()->init(info);
-    _gy.allocator()->init(info);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_gx);
-    _memory_group.manage(&_gy);
-
-    // Initialise derivate kernel
-    _derivative.configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-
-    // Initialise magnitude/phase kernel
-    if(PhaseType::UNSIGNED == phase_type)
-    {
-        _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
-    }
-    else
-    {
-        _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED);
-    }
-
-    // Allocate intermediate tensors
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-}
-
-void CLHOGGradient::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Run derivative
-    _derivative.run();
-
-    // Run magnitude/phase kernel
-    CLScheduler::get().enqueue(*_mag_phase);
-}
\ No newline at end of file

diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
deleted file mode 100644
index 2464e6c..0000000
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ /dev/null

@@ -1,282 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/Scheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
-#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-using namespace arm_compute;
-
-CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _gradient_kernel(),
-      _orient_bin_kernel(),
-      _block_norm_kernel(),
-      _hog_detect_kernel(),
-      _non_maxima_kernel(),
-      _hog_space(),
-      _hog_norm_space(),
-      _detection_windows(),
-      _mag(),
-      _phase(),
-      _non_maxima_suppression(false),
-      _num_orient_bin_kernel(0),
-      _num_block_norm_kernel(0),
-      _num_hog_detect_kernel(0)
-{
-}
-
-CLHOGMultiDetection::~CLHOGMultiDetection() = default;
-
-void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
-                                    uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, multi_hog, detection_windows, detection_window_strides, border_mode, constant_border_value, threshold, non_maxima_suppression,
-              min_distance);
-}
-
-void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows,
-                                    ICLSize2DArray *detection_window_strides, BorderMode border_mode,
-                                    uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
-    ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
-    ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models());
-
-    const size_t       width      = input->info()->dimension(Window::DimX);
-    const size_t       height     = input->info()->dimension(Window::DimY);
-    const TensorShape &shape_img  = input->info()->tensor_shape();
-    const size_t       num_models = multi_hog->num_models();
-    PhaseType          phase_type = multi_hog->model(0)->info()->phase_type();
-
-    size_t prev_num_bins     = multi_hog->model(0)->info()->num_bins();
-    Size2D prev_cell_size    = multi_hog->model(0)->info()->cell_size();
-    Size2D prev_block_size   = multi_hog->model(0)->info()->block_size();
-    Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride();
-
-    /* Check if CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object
-     *
-     * 1) CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change.
-     *        Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th
-     * 2) CLHOGBlockNormalizationKernel is skipped if the cell size, the number of bins and block size do not change.
-     *         Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th
-     *
-     * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel
-     *       with "input_orient_bin", "input_hog_detect" and "input_block_norm"
-     */
-    std::vector<size_t> input_orient_bin;
-    std::vector<size_t> input_hog_detect;
-    std::vector<std::pair<size_t, size_t>> input_block_norm;
-
-    input_orient_bin.push_back(0);
-    input_hog_detect.push_back(0);
-    input_block_norm.emplace_back(0, 0);
-
-    for(size_t i = 1; i < num_models; ++i)
-    {
-        size_t cur_num_bins     = multi_hog->model(i)->info()->num_bins();
-        Size2D cur_cell_size    = multi_hog->model(i)->info()->cell_size();
-        Size2D cur_block_size   = multi_hog->model(i)->info()->block_size();
-        Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride();
-
-        if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
-        {
-            prev_num_bins     = cur_num_bins;
-            prev_cell_size    = cur_cell_size;
-            prev_block_size   = cur_block_size;
-            prev_block_stride = cur_block_stride;
-
-            // Compute orientation binning and block normalization kernels. Update input to process
-            input_orient_bin.push_back(i);
-            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
-        }
-        else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
-                || (cur_block_stride.height != prev_block_stride.height))
-        {
-            prev_block_size   = cur_block_size;
-            prev_block_stride = cur_block_stride;
-
-            // Compute block normalization kernel. Update input to process
-            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
-        }
-
-        // Update input to process for hog detector kernel
-        input_hog_detect.push_back(input_block_norm.size() - 1);
-    }
-
-    _detection_windows      = detection_windows;
-    _non_maxima_suppression = non_maxima_suppression;
-    _num_orient_bin_kernel  = input_orient_bin.size(); // Number of CLHOGOrientationBinningKernel kernels to compute
-    _num_block_norm_kernel  = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
-    _num_hog_detect_kernel  = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
-
-    _orient_bin_kernel.reserve(_num_orient_bin_kernel);
-    _block_norm_kernel.reserve(_num_block_norm_kernel);
-    _hog_detect_kernel.resize(_num_hog_detect_kernel);
-    _hog_space.resize(_num_orient_bin_kernel);
-    _hog_norm_space.resize(_num_block_norm_kernel);
-
-    // Allocate tensors for magnitude and phase
-    TensorInfo info_mag(shape_img, Format::S16);
-    _mag.allocator()->init(info_mag);
-
-    TensorInfo info_phase(shape_img, Format::U8);
-    _phase.allocator()->init(info_phase);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_mag);
-    _memory_group.manage(&_phase);
-
-    // Initialise gradient kernel
-    _gradient_kernel.configure(compile_context, input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
-
-    // Configure NETensor for the HOG space and orientation binning kernel
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        const size_t idx_multi_hog = input_orient_bin[i];
-
-        // Get the corresponding cell size and number of bins
-        const Size2D &cell     = multi_hog->model(idx_multi_hog)->info()->cell_size();
-        const size_t  num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins();
-
-        // Calculate number of cells along the x and y directions for the hog_space
-        const size_t num_cells_x = width / cell.width;
-        const size_t num_cells_y = height / cell.height;
-
-        // TensorShape of hog space
-        TensorShape shape_hog_space = input->info()->tensor_shape();
-        shape_hog_space.set(Window::DimX, num_cells_x);
-        shape_hog_space.set(Window::DimY, num_cells_y);
-
-        // Allocate HOG space
-        TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
-        _hog_space[i].allocator()->init(info_space);
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_hog_space[i]);
-
-        // Initialise orientation binning kernel
-        _orient_bin_kernel.emplace_back(std::make_unique<CLHOGOrientationBinningKernel>());
-        _orient_bin_kernel.back()->configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
-    }
-
-    // Allocate intermediate tensors
-    _mag.allocator()->allocate();
-    _phase.allocator()->allocate();
-
-    // Configure CLTensor for the normalized HOG space and block normalization kernel
-    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
-    {
-        const size_t idx_multi_hog  = input_block_norm[i].first;
-        const size_t idx_orient_bin = input_block_norm[i].second;
-
-        // Allocate normalized HOG space
-        TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
-        _hog_norm_space[i].allocator()->init(tensor_info);
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_hog_norm_space[i]);
-
-        // Initialize block normalization kernel
-        _block_norm_kernel.emplace_back(std::make_unique<CLHOGBlockNormalizationKernel>());
-        _block_norm_kernel.back()->configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
-    }
-
-    // Allocate intermediate tensors
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        _hog_space[i].allocator()->allocate();
-    }
-
-    detection_window_strides->map(CLScheduler::get().queue(), true);
-
-    // Configure HOG detector kernel
-    for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
-    {
-        const size_t idx_block_norm = input_hog_detect[i];
-
-        _hog_detect_kernel[i].configure(compile_context, &_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
-    }
-
-    detection_window_strides->unmap(CLScheduler::get().queue());
-
-    // Configure non maxima suppression kernel
-    _non_maxima_kernel.configure(_detection_windows, min_distance);
-
-    // Allocate intermediate tensors
-    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
-    {
-        _hog_norm_space[i].allocator()->allocate();
-    }
-}
-
-void CLHOGMultiDetection::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Reset detection window
-    _detection_windows->clear();
-
-    // Run gradient
-    _gradient_kernel.run();
-
-    // Run orientation binning kernel
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        CLScheduler::get().enqueue(*_orient_bin_kernel[i], false);
-    }
-
-    // Run block normalization kernel
-    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
-    {
-        CLScheduler::get().enqueue(*_block_norm_kernel[i], false);
-    }
-
-    // Run HOG detector kernel
-    for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
-    {
-        _hog_detect_kernel[i].run();
-    }
-
-    // Run non-maxima suppression kernel if enabled
-    if(_non_maxima_suppression)
-    {
-        // Map detection windows array before computing non maxima suppression
-        _detection_windows->map(CLScheduler::get().queue(), true);
-        Scheduler::get().schedule(&_non_maxima_kernel, Window::DimY);
-        _detection_windows->unmap(CLScheduler::get().queue());
-    }
-}

diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
deleted file mode 100644
index 37f428c..0000000
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ /dev/null

@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
-#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
-#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "arm_compute/runtime/Scheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLHarrisCornersKernel.h"
-#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
-#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
-
-#include <cmath>
-#include <utility>
-
-using namespace arm_compute;
-
-CLHarrisCorners::CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _sobel(nullptr),
-      _harris_score(std::make_unique<CLHarrisScoreKernel>()),
-      _non_max_suppr(),
-      _candidates(),
-      _sort_euclidean(),
-      _border_gx(std::make_unique<CLFillBorderKernel>()),
-      _border_gy(std::make_unique<CLFillBorderKernel>()),
-      _gx(),
-      _gy(),
-      _score(),
-      _nonmax(),
-      _corners_list(),
-      _num_corner_candidates(0),
-      _corners(nullptr)
-{
-}
-
-CLHarrisCorners::~CLHarrisCorners() = default;
-
-void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist,
-                                float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
-                                BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, threshold, min_dist, sensitivity, gradient_size, block_size, corners, border_mode, constant_border_value, use_fp16);
-}
-
-void CLHarrisCorners::configure(const CLCompileContext &compile_context, ICLImage *input, float threshold, float min_dist,
-                                float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
-                                BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
-{
-    ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-772): Add half float support
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
-    ARM_COMPUTE_ERROR_ON(nullptr == corners);
-
-    _corners = corners;
-
-    const TensorShape shape = input->info()->tensor_shape();
-    const DataType    dt    = (gradient_size < 7) ? DataType::S16 : DataType::S32;
-    TensorInfo        tensor_info(shape, 1, dt);
-
-    _gx.allocator()->init(tensor_info);
-    _gy.allocator()->init(tensor_info);
-
-    TensorInfo info_f32(shape, 1, DataType::F32);
-    _score.allocator()->init(info_f32);
-    _nonmax.allocator()->init(info_f32);
-
-    _corners_list.resize(shape.x() * shape.y());
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_gx);
-    _memory_group.manage(&_gy);
-
-    /* Set/init Sobel kernel accordingly with gradient_size */
-    switch(gradient_size)
-    {
-        case 3:
-        {
-            auto k = std::make_unique<CLSobel3x3>();
-            k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-            _sobel = std::move(k);
-            break;
-        }
-        case 5:
-        {
-            auto k = std::make_unique<CLSobel5x5>();
-            k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-            _sobel = std::move(k);
-            break;
-        }
-        case 7:
-        {
-            auto k = std::make_unique<CLSobel7x7>();
-            k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-            _sobel = std::move(k);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Gradient size not implemented");
-    }
-
-    // Normalization factor
-    const float norm_factor               = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
-    const float pow4_normalization_factor = pow(norm_factor, 4);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_score);
-
-    // Set/init Harris Score kernel accordingly with block_size
-    _harris_score->configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-
-    // Configure border filling using harris score kernel's block size
-    _border_gx->configure(compile_context, &_gx, _harris_score->border_size(), border_mode, PixelValue(constant_border_value));
-    _border_gy->configure(compile_context, &_gy, _harris_score->border_size(), border_mode, PixelValue(constant_border_value));
-
-    // Allocate intermediate buffers
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_nonmax);
-
-    // Init non-maxima suppression function
-    _non_max_suppr.configure(compile_context, &_score, &_nonmax, border_mode);
-
-    // Allocate intermediate buffers
-    _score.allocator()->allocate();
-
-    // Init corner candidates kernel
-    _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
-
-    // Allocate intermediate buffers
-    _nonmax.allocator()->allocate();
-
-    // Init euclidean distance
-    _sort_euclidean.configure(_corners_list.data(), _corners, &_num_corner_candidates, min_dist);
-}
-
-void CLHarrisCorners::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Init to 0 number of corner candidates
-    _num_corner_candidates = 0;
-
-    // Run Sobel kernel
-    _sobel->run();
-
-    // Fill border before harris score kernel
-    CLScheduler::get().enqueue(*_border_gx, false);
-    CLScheduler::get().enqueue(*_border_gy, false);
-
-    // Run harris score kernel
-    CLScheduler::get().enqueue(*_harris_score, false);
-
-    // Run non-maxima suppression
-    _non_max_suppr.run();
-
-    // Run corner candidate kernel
-    _nonmax.map(true);
-    Scheduler::get().schedule(&_candidates, Window::DimY);
-    _nonmax.unmap();
-
-    _corners->map(CLScheduler::get().queue(), true);
-    Scheduler::get().schedule(&_sort_euclidean, Window::DimY);
-    _corners->unmap(CLScheduler::get().queue());
-}

diff --git a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp
deleted file mode 100644
index f278cf0..0000000
--- a/src/runtime/CL/functions/CLHistogram.cpp
+++ /dev/null

@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHistogram.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLHistogram::CLHistogram()
-    : _kernel(), _kernel_border()
-{
-}
-
-void CLHistogram::configure(const ICLImage *input, ICLDistribution1D *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output)
-{
-    _kernel.configure(compile_context, input, output);
-    _kernel_border.configure(compile_context, input, output);
-}
-
-void CLHistogram::run()
-{
-    CLScheduler::get().enqueue(_kernel, false);
-    CLScheduler::get().enqueue(_kernel_border);
-}

diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp
deleted file mode 100644
index 56a151a..0000000
--- a/src/runtime/CL/functions/CLIntegralImage.cpp
+++ /dev/null

@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLIntegralImageKernel.h"
-
-using namespace arm_compute;
-
-CLIntegralImage::CLIntegralImage()
-    : _integral_hor(std::make_unique<CLIntegralImageHorKernel>()),
-      _integral_vert(std::make_unique<CLIntegralImageVertKernel>())
-{
-}
-
-CLIntegralImage::~CLIntegralImage() = default;
-
-void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLIntegralImage::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    _integral_hor->configure(compile_context, input, output);
-    _integral_vert->configure(compile_context, output);
-}
-
-void CLIntegralImage::run()
-{
-    CLScheduler::get().enqueue(*_integral_hor, false);
-    CLScheduler::get().enqueue(*_integral_vert);
-}

diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
deleted file mode 100644
index 1ad19e5..0000000
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ /dev/null

@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
-#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
-#include "src/core/CL/kernels/CLGaussianPyramidKernel.h"
-
-using namespace arm_compute;
-
-CLLaplacianPyramid::CLLaplacianPyramid() // NOLINT
-    : _num_levels(0),
-      _gaussian_pyr_function(),
-      _convf(),
-      _subf(),
-      _depth_function(),
-      _gauss_pyr(),
-      _conv_pyr()
-{
-}
-
-void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, output, border_mode, constant_border_value);
-}
-
-void CLLaplacianPyramid::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
-
-    _num_levels = pyramid->info()->num_levels();
-
-    // Create and initialize the gaussian pyramid and the convoluted pyramid
-    PyramidInfo pyramid_info;
-    pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8);
-
-    _gauss_pyr.init(pyramid_info);
-    _conv_pyr.init(pyramid_info);
-
-    // Create Gaussian Pyramid function
-    _gaussian_pyr_function.configure(compile_context, input, &_gauss_pyr, border_mode, constant_border_value);
-
-    _convf.resize(_num_levels);
-    _subf.resize(_num_levels);
-
-    for(unsigned int i = 0; i < _num_levels; ++i)
-    {
-        _convf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value);
-        _subf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP);
-    }
-
-    _depth_function.configure(compile_context, _conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0);
-
-    _gauss_pyr.allocate();
-    _conv_pyr.allocate();
-}
-
-void CLLaplacianPyramid::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function");
-
-    _gaussian_pyr_function.run(); // compute gaussian pyramid
-
-    for(unsigned int i = 0; i < _num_levels; ++i)
-    {
-        _convf[i].run(); // convolute gaussian pyramid
-    }
-
-    for(unsigned int i = 0; i < _num_levels; ++i)
-    {
-        _subf[i].run(); // compute laplacian image
-    }
-
-    _depth_function.run();
-}

diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
deleted file mode 100644
index d7fd817..0000000
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ /dev/null

@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <cstddef>
-
-using namespace arm_compute;
-
-CLLaplacianReconstruct::CLLaplacianReconstruct() // NOLINT
-    : _tmp_pyr(),
-      _addf(),
-      _scalef(),
-      _depthf()
-{
-}
-
-void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), pyramid, input, output, border_mode, constant_border_value);
-}
-
-void CLLaplacianReconstruct::configure(const CLCompileContext &compile_context, const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
-    ARM_COMPUTE_ERROR_ON(input == output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
-    ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
-
-    const size_t num_levels = pyramid->info()->num_levels();
-
-    // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) )
-    PyramidInfo pyramid_info;
-    pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16);
-    _tmp_pyr.init(pyramid_info);
-
-    // Allocate add and scale functions. Level 0 does not need to be scaled.
-    _addf.resize(num_levels);
-    _scalef.resize(num_levels - 1);
-
-    const size_t last_level = num_levels - 1;
-
-    _addf[last_level].configure(compile_context, input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE);
-
-    // Scale levels n-1 to 1, and add levels n-2 to 0
-    for(size_t l = 0; l < last_level; ++l)
-    {
-        _scalef[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), ScaleKernelInfo{ arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value });
-        _addf[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE);
-    }
-
-    // Convert level 0 from S16 to U8
-    _depthf.configure(compile_context, _tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0);
-
-    _tmp_pyr.allocate();
-}
-
-void CLLaplacianReconstruct::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function");
-
-    const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
-
-    _addf[last_level].run();
-
-    // Run l = [last_level - 1, 0]
-    for(size_t l = last_level; l-- > 0;)
-    {
-        _scalef[l].run();
-        _addf[l].run();
-    }
-
-    _depthf.run();
-}

diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
deleted file mode 100644
index 0599a11..0000000
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ /dev/null

@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
-
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, mag_type);
-}
-
-void CLMagnitude::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
-{
-    auto k = std::make_unique<CLMagnitudePhaseKernel>();
-    k->configure(compile_context, input1, input2, output, nullptr, mag_type);
-    _kernel = std::move(k);
-}

diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
deleted file mode 100644
index d8cd41d..0000000
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ /dev/null

@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/TensorInfo.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLMeanStdDevKernel.h"
-#include "src/core/CL/kernels/CLReductionOperationKernel.h"
-
-using namespace arm_compute;
-
-CLMeanStdDev::CLMeanStdDev(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _data_type(),
-      _num_pixels(),
-      _run_stddev(),
-      _reduction_operation_mean(),
-      _reduction_operation_stddev(),
-      _reduction_output_mean(),
-      _reduction_output_stddev(),
-      _mean(nullptr),
-      _stddev(nullptr),
-      _mean_stddev_kernel(std::make_unique<CLMeanStdDevKernel>()),
-      _fill_border_kernel(std::make_unique<CLFillBorderKernel>()),
-      _global_sum(),
-      _global_sum_squared()
-{
-}
-
-CLMeanStdDev::~CLMeanStdDev() = default;
-
-Status CLMeanStdDev::validate(ITensorInfo *input, float *mean, float *stddev)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input);
-    if(is_data_type_float(input->data_type()))
-    {
-        ARM_COMPUTE_UNUSED(mean);
-        ARM_COMPUTE_UNUSED(stddev);
-
-        TensorShape output_shape      = TensorShape{ 1, input->dimension(1) };
-        TensorInfo  output_shape_info = TensorInfo(output_shape, 1, DataType::U8);
-        return CLReductionOperation::validate(input, &output_shape_info, 0, ReductionOperation::SUM);
-    }
-    else
-    {
-        return CLMeanStdDevKernel::validate(input, mean, nullptr, stddev, nullptr);
-    }
-}
-
-void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, mean, stddev);
-}
-
-void CLMeanStdDev::configure(const CLCompileContext &compile_context, ICLImage *input, float *mean, float *stddev)
-{
-    // In the case of F16/F32 we call reduction operation for calculating CLMeanStdDev
-    _data_type = input->info()->data_type();
-
-    if(is_data_type_float(_data_type))
-    {
-        _num_pixels = input->info()->dimension(0) * input->info()->dimension(1);
-
-        _memory_group.manage(&_reduction_output_mean);
-        _reduction_operation_mean.configure(compile_context, input, &_reduction_output_mean, 0, ReductionOperation::SUM);
-        _reduction_output_mean.allocator()->allocate();
-        _mean = mean;
-
-        if(stddev != nullptr)
-        {
-            _memory_group.manage(&_reduction_output_stddev);
-            _reduction_operation_stddev.configure(compile_context, input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE);
-            _reduction_output_stddev.allocator()->allocate();
-            _stddev     = stddev;
-            _run_stddev = true;
-        }
-    }
-    else
-    {
-        _global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
-
-        if(stddev != nullptr)
-        {
-            _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
-        }
-
-        _mean_stddev_kernel->configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared);
-        _fill_border_kernel->configure(compile_context, input, _mean_stddev_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
-    }
-}
-
-template <typename T>
-void CLMeanStdDev::run_float()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Perform reduction on x-axis
-    _reduction_operation_mean.run();
-    if(_run_stddev)
-    {
-        _reduction_operation_stddev.run();
-        _reduction_output_stddev.map(true);
-    }
-
-    _reduction_output_mean.map(true);
-
-    auto mean = static_cast<T>(0);
-
-    // Calculate final result for mean
-    for(unsigned int i = 0; i < _reduction_output_mean.info()->dimension(1); ++i)
-    {
-        mean += *reinterpret_cast<T *>(_reduction_output_mean.buffer() + _reduction_output_mean.info()->offset_element_in_bytes(Coordinates(0, i)));
-    }
-
-    mean /= _num_pixels;
-    *_mean = mean;
-
-    if(_run_stddev)
-    {
-        auto stddev = static_cast<T>(0);
-        // Calculate final result for stddev
-        for(unsigned int i = 0; i < _reduction_output_stddev.info()->dimension(1); ++i)
-        {
-            stddev += *reinterpret_cast<T *>(_reduction_output_stddev.buffer() + _reduction_output_stddev.info()->offset_element_in_bytes(Coordinates(0, i)));
-        }
-        *_stddev = std::sqrt((stddev / _num_pixels) - (mean * mean));
-
-        _reduction_output_stddev.unmap();
-    }
-    _reduction_output_mean.unmap();
-}
-
-void CLMeanStdDev::run_int()
-{
-    CLScheduler::get().enqueue(*_fill_border_kernel);
-    CLScheduler::get().enqueue(*_mean_stddev_kernel);
-}
-
-void CLMeanStdDev::run()
-{
-    switch(_data_type)
-    {
-        case DataType::F16:
-            run_float<half>();
-            break;
-        case DataType::F32:
-            run_float<float>();
-            break;
-        case DataType::U8:
-            run_int();
-            break;
-        default:
-            ARM_COMPUTE_ERROR_ON("Not supported");
-    }
-}

diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
deleted file mode 100644
index b32063a..0000000
--- a/src/runtime/CL/functions/CLMedian3x3.cpp
+++ /dev/null

@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLMedian3x3Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLMedian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLMedian3x3Kernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
deleted file mode 100644
index ace6a1c..0000000
--- a/src/runtime/CL/functions/CLMinMaxLocation.cpp
+++ /dev/null

@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "src/core/CL/kernels/CLMinMaxLocationKernel.h"
-
-namespace arm_compute
-{
-CLMinMaxLocation::CLMinMaxLocation()
-    : _min_max_kernel(std::make_unique<CLMinMaxKernel>()),
-      _min_max_loc_kernel(std::make_unique<CLMinMaxLocationKernel>()),
-      _min_max_vals(),
-      _min_max_count_vals(),
-      _min(nullptr),
-      _max(nullptr),
-      _min_count(nullptr),
-      _max_count(nullptr),
-      _min_loc(nullptr),
-      _max_loc(nullptr)
-{
-}
-
-CLMinMaxLocation::~CLMinMaxLocation() = default;
-
-void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, min, max, min_loc, max_loc, min_count, max_count);
-}
-
-void CLMinMaxLocation::configure(const CLCompileContext &compile_context, const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc,
-                                 uint32_t *min_count,
-                                 uint32_t *max_count)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == min);
-    ARM_COMPUTE_ERROR_ON(nullptr == max);
-
-    _min_max_vals       = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(int32_t));
-    _min_max_count_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(uint32_t));
-    _min                = min;
-    _max                = max;
-    _min_count          = min_count;
-    _max_count          = max_count;
-    _min_loc            = min_loc;
-    _max_loc            = max_loc;
-
-    _min_max_kernel->configure(compile_context, input, &_min_max_vals);
-    _min_max_loc_kernel->configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc);
-}
-
-void CLMinMaxLocation::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    CLScheduler::get().enqueue(*_min_max_kernel, false);
-    CLScheduler::get().enqueue(*_min_max_loc_kernel, false);
-
-    // Update min and max
-    q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_min));
-    q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_max));
-
-    // Update min and max count
-    if(_min_count != nullptr)
-    {
-        q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 0 * sizeof(uint32_t), sizeof(uint32_t), _min_count);
-    }
-    if(_max_count != nullptr)
-    {
-        q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 1 * sizeof(uint32_t), sizeof(uint32_t), _max_count);
-    }
-
-    // Update min/max point arrays (Makes the kernel blocking)
-    if(_min_loc != nullptr)
-    {
-        unsigned int min_count = 0;
-        q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 0 * sizeof(uint32_t), sizeof(uint32_t), &min_count);
-        size_t min_corner_size = std::min(static_cast<size_t>(min_count), _min_loc->max_num_values());
-        _min_loc->resize(min_corner_size);
-    }
-    if(_max_loc != nullptr)
-    {
-        unsigned int max_count = 0;
-        q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 1 * sizeof(uint32_t), sizeof(uint32_t), &max_count);
-        size_t max_corner_size = std::min(static_cast<size_t>(max_count), _max_loc->max_num_values());
-        _max_loc->resize(max_corner_size);
-    }
-}
-} // namespace arm_compute

diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
deleted file mode 100644
index ec88f87..0000000
--- a/src/runtime/CL/functions/CLNonLinearFilter.cpp
+++ /dev/null

@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
-
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLNonLinearFilterKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                                  BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_mode, constant_border_value);
-}
-
-void CLNonLinearFilter::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern,
-                                  const uint8_t *mask, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLNonLinearFilterKernel>();
-    k->configure(compile_context, input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
deleted file mode 100644
index 5906ea5..0000000
--- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
+++ /dev/null

@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
-
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode);
-}
-
-void CLNonMaximaSuppression3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode)
-{
-    auto k = std::make_unique<CLNonMaximaSuppression3x3Kernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-
-    if(border_mode != BorderMode::UNDEFINED)
-    {
-        _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT);
-    }
-    else
-    {
-        _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED);
-    }
-}

diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
deleted file mode 100644
index 76e0ac5..0000000
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ /dev/null

@@ -1,184 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLLKTrackerKernel.h"
-
-using namespace arm_compute;
-
-CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _tracker_init_kernel(),
-      _tracker_stage0_kernel(),
-      _tracker_stage1_kernel(),
-      _tracker_finalize_kernel(std::make_unique<CLLKTrackerFinalizeKernel>()),
-      _func_scharr(),
-      _scharr_gx(),
-      _scharr_gy(),
-      _old_points(nullptr),
-      _new_points_estimates(nullptr),
-      _new_points(nullptr),
-      _old_points_internal(),
-      _new_points_internal(),
-      _coefficient_table(),
-      _old_values(),
-      _num_levels(0)
-{
-}
-
-CLOpticalFlow::~CLOpticalFlow() = default;
-
-void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
-                              const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
-                              Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
-                              BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), old_pyramid, new_pyramid, old_points, new_points_estimates, new_points, termination, epsilon, num_iterations, window_dimension,
-              use_initial_estimate, border_mode, constant_border_value);
-}
-
-void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
-                              const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
-                              Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
-                              BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid);
-    ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid);
-    ARM_COMPUTE_ERROR_ON(nullptr == old_points);
-    ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates);
-    ARM_COMPUTE_ERROR_ON(nullptr == new_points);
-    ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels());
-    ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels());
-    ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width());
-    ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height());
-    ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values());
-
-    // Set member variables
-    _old_points           = old_points;
-    _new_points_estimates = new_points_estimates;
-    _new_points           = new_points;
-    _num_levels           = old_pyramid->info()->num_levels();
-
-    const float pyr_scale              = old_pyramid->info()->scale();
-    const int   list_length            = old_points->num_values();
-    const int   old_values_list_length = list_length * window_dimension * window_dimension;
-
-    // Create kernels and tensors
-    _tracker_init_kernel.reserve(_num_levels);
-    _tracker_stage0_kernel.reserve(_num_levels);
-    _tracker_stage1_kernel.reserve(_num_levels);
-    _func_scharr.resize(_num_levels);
-    _scharr_gx.resize(_num_levels);
-    _scharr_gy.resize(_num_levels);
-
-    // Create internal keypoint arrays
-    _old_points_internal = std::make_unique<CLLKInternalKeypointArray>(list_length);
-    _old_points_internal->resize(list_length);
-    _new_points_internal = std::make_unique<CLLKInternalKeypointArray>(list_length);
-    _new_points_internal->resize(list_length);
-    _coefficient_table = std::make_unique<CLCoefficientTableArray>(list_length);
-    _coefficient_table->resize(list_length);
-    _old_values = std::make_unique<CLOldValueArray>(old_values_list_length);
-    _old_values->resize(old_values_list_length);
-    _new_points->resize(list_length);
-
-    for(size_t i = 0; i < _num_levels; ++i)
-    {
-        // Get images from the ith level of old and right pyramid
-        ICLImage *old_ith_input = old_pyramid->get_pyramid_level(i);
-        ICLImage *new_ith_input = new_pyramid->get_pyramid_level(i);
-
-        // Get width and height of images
-        const unsigned int width_ith  = old_ith_input->info()->dimension(0);
-        const unsigned int height_ith = new_ith_input->info()->dimension(1);
-
-        // Initialize Scharr tensors
-        TensorInfo tensor_info(TensorShape(width_ith, height_ith), 1, DataType::S16);
-        _scharr_gx[i].allocator()->init(tensor_info);
-        _scharr_gy[i].allocator()->init(tensor_info);
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_scharr_gx[i]);
-        _memory_group.manage(&_scharr_gy[i]);
-
-        // Init Scharr kernel
-        _func_scharr[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
-
-        // Init Lucas-Kanade init kernel
-        _tracker_init_kernel.emplace_back(std::make_unique<CLLKTrackerInitKernel>());
-        _tracker_init_kernel.back()->configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale);
-
-        // Init Lucas-Kanade stage0 kernel
-        _tracker_stage0_kernel.emplace_back(std::make_unique<CLLKTrackerStage0Kernel>());
-        _tracker_stage0_kernel.back()->configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i],
-                                                 _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
-                                                 window_dimension, i);
-
-        // Init Lucas-Kanade stage1 kernel
-        _tracker_stage1_kernel.emplace_back(std::make_unique<CLLKTrackerStage1Kernel>());
-        _tracker_stage1_kernel.back()->configure(compile_context, new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
-                                                 termination, epsilon, num_iterations, window_dimension, i);
-
-        // Allocate intermediate buffers
-        _scharr_gx[i].allocator()->allocate();
-        _scharr_gy[i].allocator()->allocate();
-    }
-
-    // Finalize Lucas-Kanade
-    _tracker_finalize_kernel->configure(compile_context, _new_points_internal.get(), new_points);
-}
-
-void CLOpticalFlow::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    for(unsigned int level = _num_levels; level > 0; --level)
-    {
-        // Run Scharr kernel
-        _func_scharr[level - 1].run();
-
-        // Run Lucas-Kanade init kernel
-        CLScheduler::get().enqueue(*_tracker_init_kernel[level - 1]);
-
-        // Run Lucas-Kanade stage0 kernel
-        CLScheduler::get().enqueue(*_tracker_stage0_kernel[level - 1]);
-
-        // Run Lucas-Kanade stage1 kernel
-        CLScheduler::get().enqueue(*_tracker_stage1_kernel[level - 1]);
-    }
-
-    CLScheduler::get().enqueue(*_tracker_finalize_kernel, true);
-}

diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
deleted file mode 100644
index b2ff5d0..0000000
--- a/src/runtime/CL/functions/CLPhase.cpp
+++ /dev/null

@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLPhase.h"
-
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, phase_type);
-}
-
-void CLPhase::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
-{
-    auto k = std::make_unique<CLMagnitudePhaseKernel>();
-    k->configure(compile_context, input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
-    _kernel = std::move(k);
-}

diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp
deleted file mode 100644
index 563ec19..0000000
--- a/src/runtime/CL/functions/CLScharr3x3.cpp
+++ /dev/null

@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLScharr3x3Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLScharr3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLScharr3x3Kernel>();
-    k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp
deleted file mode 100644
index 6724c12..0000000
--- a/src/runtime/CL/functions/CLSobel3x3.cpp
+++ /dev/null

@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLSobel3x3Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-CLSobel3x3::~CLSobel3x3() = default;
-
-void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLSobel3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLSobel3x3Kernel>();
-    k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
deleted file mode 100644
index 98f2157..0000000
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ /dev/null

@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
-
-using namespace arm_compute;
-
-CLSobel5x5::CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _sobel_hor(std::make_unique<CLSobel5x5HorKernel>()),
-      _sobel_vert(std::make_unique<CLSobel5x5VertKernel>()),
-      _border_handler(std::make_unique<CLFillBorderKernel>()),
-      _tmp_x(),
-      _tmp_y()
-{
-}
-
-CLSobel5x5::~CLSobel5x5() = default;
-
-void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    const bool run_sobel_x = output_x != nullptr;
-    const bool run_sobel_y = output_y != nullptr;
-
-    TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16);
-
-    if(run_sobel_x && run_sobel_y)
-    {
-        _tmp_x.allocator()->init(tensor_info);
-        _tmp_y.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_x);
-        _memory_group.manage(&_tmp_y);
-        _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-        _tmp_x.allocator()->allocate();
-        _tmp_y.allocator()->allocate();
-    }
-    else if(run_sobel_x)
-    {
-        _tmp_x.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_x);
-        _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
-        _tmp_x.allocator()->allocate();
-    }
-    else if(run_sobel_y)
-    {
-        _tmp_y.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_y);
-        _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
-        _tmp_y.allocator()->allocate();
-    }
-    _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value));
-}
-
-void CLSobel5x5::run()
-{
-    CLScheduler::get().enqueue(*_border_handler, false);
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    CLScheduler::get().enqueue(*_sobel_hor, false);
-    CLScheduler::get().enqueue(*_sobel_vert);
-}

diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
deleted file mode 100644
index a3d63f9..0000000
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ /dev/null

@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
-
-using namespace arm_compute;
-
-CLSobel7x7::CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _sobel_hor(std::make_unique<CLSobel7x7HorKernel>()),
-      _sobel_vert(std::make_unique<CLSobel7x7VertKernel>()),
-      _border_handler(std::make_unique<CLFillBorderKernel>()),
-      _tmp_x(),
-      _tmp_y()
-{
-}
-
-CLSobel7x7::~CLSobel7x7() = default;
-
-void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    const bool run_sobel_x = output_x != nullptr;
-    const bool run_sobel_y = output_y != nullptr;
-
-    TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S32);
-
-    if(run_sobel_x && run_sobel_y)
-    {
-        _tmp_x.allocator()->init(tensor_info);
-        _tmp_y.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_x);
-        _memory_group.manage(&_tmp_y);
-        _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-        _tmp_x.allocator()->allocate();
-        _tmp_y.allocator()->allocate();
-    }
-    else if(run_sobel_x)
-    {
-        _tmp_x.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_x);
-        _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
-        _tmp_x.allocator()->allocate();
-    }
-    else if(run_sobel_y)
-    {
-        _tmp_y.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_y);
-        _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
-        _tmp_y.allocator()->allocate();
-    }
-    _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value));
-}
-
-void CLSobel7x7::run()
-{
-    CLScheduler::get().enqueue(*_border_handler, false);
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    CLScheduler::get().enqueue(*_sobel_hor, false);
-    CLScheduler::get().enqueue(*_sobel_vert);
-}

diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp
deleted file mode 100644
index a4671f5..0000000
--- a/src/runtime/CL/functions/CLTableLookup.cpp
+++ /dev/null

@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
-
-#include "src/core/CL/kernels/CLTableLookupKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, lut, output);
-}
-
-void CLTableLookup::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
-{
-    auto k = std::make_unique<CLTableLookupKernel>();
-    k->configure(compile_context, input, lut, output);
-    _kernel = std::move(k);
-}

diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp
deleted file mode 100644
index 70bc3b9..0000000
--- a/src/runtime/CL/functions/CLThreshold.cpp
+++ /dev/null

@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLThreshold.h"
-
-#include "src/core/CL/kernels/CLThresholdKernel.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
-}
-
-void CLThreshold::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
-{
-    auto k = std::make_unique<CLThresholdKernel>();
-    k->configure(compile_context, input, output, info);
-    _kernel = std::move(k);
-}
-} // namespace arm_compute

diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp
deleted file mode 100644
index 9a22446..0000000
--- a/src/runtime/CL/functions/CLWarpAffine.cpp
+++ /dev/null

@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLWarpAffineKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value);
-}
-
-void CLWarpAffine::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
-                             uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLWarpAffineKernel>();
-    k->configure(compile_context, input, output, matrix, policy);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
deleted file mode 100644
index 0ec6b42..0000000
--- a/src/runtime/CL/functions/CLWarpPerspective.cpp
+++ /dev/null

@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value);
-}
-
-void CLWarpPerspective::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
-                                  uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLWarpPerspectiveKernel>();
-    k->configure(compile_context, input, output, matrix, policy);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}

diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
deleted file mode 100644
index ad62a22..0000000
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ /dev/null

@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/NEON/kernels/NEConvolutionKernel.h"
-#include "src/core/NEON/kernels/NEConvolutionKernel.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-
-#include <array>
-#include <utility>
-
-namespace arm_compute
-{
-NEConvolution3x3::~NEConvolution3x3() = default;
-
-void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<NEConvolution3x3Kernel>();
-    k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-
-    auto b = std::make_unique<NEFillBorderKernel>();
-    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-    _border_handler = std::move(b);
-}
-
-template <unsigned int matrix_size>
-NEConvolutionSquare<matrix_size>::~NEConvolutionSquare() = default;
-
-template <unsigned int matrix_size>
-NEConvolutionSquare<matrix_size>::NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
-{
-}
-
-template <unsigned int matrix_size>
-void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
-                                                 uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON(conv == nullptr);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-
-    std::array<int16_t, matrix_size> conv_col{ { 0 } };
-    std::array<int16_t, matrix_size> conv_row{ { 0 } };
-
-    _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size);
-
-    auto b = std::make_unique<NEFillBorderKernel>();
-    if(_is_separable)
-    {
-        DataType intermediate_type = DataType::UNKNOWN;
-        std::tie(std::ignore, intermediate_type) = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size);
-
-        _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type));
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_tmp);
-
-        // Calculate scale
-        if(scale == 0)
-        {
-            scale = calculate_matrix_scale(conv, matrix_size);
-        }
-
-        _kernel_hor  = std::make_unique<NESeparableConvolutionHorKernel<matrix_size>>();
-        _kernel_vert = std::make_unique<NESeparableConvolutionVertKernel<matrix_size>>();
-
-        _kernel_hor->configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
-        _kernel_vert->configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED);
-
-        _tmp.allocator()->allocate();
-
-        b->configure(input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value));
-    }
-    else
-    {
-        _kernel = std::make_unique<NEConvolutionKernel<matrix_size>>();
-        _kernel->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
-        b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-    }
-    _border_handler = std::move(b);
-}
-
-template <unsigned int matrix_size>
-void                   NEConvolutionSquare<matrix_size>::run()
-{
-    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
-
-    if(_is_separable)
-    {
-        MemoryGroupResourceScope scope_mg(_memory_group);
-
-        NEScheduler::get().schedule(_kernel_hor.get(), Window::DimY);
-        NEScheduler::get().schedule(_kernel_vert.get(), Window::DimY);
-    }
-    else
-    {
-        NEScheduler::get().schedule(_kernel.get(), Window::DimY);
-    }
-}
-
-template class arm_compute::NEConvolutionSquare<5>;
-template class arm_compute::NEConvolutionSquare<7>;
-template class arm_compute::NEConvolutionSquare<9>;
-
-NEConvolutionRectangle::~NEConvolutionRectangle() = default;
-
-void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-{
-    border_mode = (border_mode == BorderMode::UNDEFINED) ? BorderMode::CONSTANT : border_mode;
-    auto k      = std::make_unique<NEConvolutionRectangleKernel>();
-    k->configure(input, output, conv, rows, cols, scale, false);
-    _kernel = std::move(k);
-
-    auto b = std::make_unique<NEFillBorderKernel>();
-    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-    _border_handler = std::move(b);
-}
-} // namespace arm_compute

diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
deleted file mode 100644
index a34be71..0000000
--- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
+++ /dev/null

@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
-
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode)
-{
-    auto k = std::make_unique<NENonMaximaSuppression3x3Kernel>();
-    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-
-    auto b = std::make_unique<NEFillBorderKernel>();
-    if(border_mode != BorderMode::UNDEFINED)
-    {
-        b->configure(input, BorderSize(1), BorderMode::CONSTANT, static_cast<float>(0.f));
-    }
-    else
-    {
-        b->configure(input, BorderSize(1), BorderMode::UNDEFINED, static_cast<float>(0.f));
-    }
-    _border_handler = std::move(b);
-}
-} // namespace arm_compute
\ No newline at end of file

diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
new file mode 100644
index 0000000..a55f7bc
--- /dev/null
+++ b/src/runtime/NEON/functions/NERemap.cpp

@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NERemap.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NERemapKernel.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
+
+    auto k = std::make_unique<NERemapKernel>();
+    k->configure(input, map_x, map_y, output, policy);
+    _kernel = std::move(k);
+
+    auto b = std::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
+}
+} // namespace arm_compute

diff --git a/tests/validation/CL/AbsoluteDifference.cpp b/tests/validation/CL/AbsoluteDifference.cpp
deleted file mode 100644
index f3eb129..0000000
--- a/tests/validation/CL/AbsoluteDifference.cpp
+++ /dev/null

@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/AbsoluteDifferenceFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Input data sets **/
-const auto AbsoluteDifferenceU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType",
-                                                 DataType::U8));
-const auto AbsoluteDifferenceS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
-                                                  framework::dataset::make("DataType", DataType::S16));
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(AbsoluteDifference)
-
-template <typename T>
-using CLAbsoluteDifferenceFixture = AbsoluteDifferenceValidationFixture<CLTensor, CLAccessor, CLAbsoluteDifference, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLAbsoluteDifferenceFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), AbsoluteDifferenceU8Dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLAbsoluteDifferenceFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AbsoluteDifferenceU8Dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLAbsoluteDifferenceFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), AbsoluteDifferenceS16Dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLAbsoluteDifferenceFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AbsoluteDifferenceS16Dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // S16
-
-TEST_SUITE_END() // AbsoluteDifference
-TEST_SUITE_END() // CL
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Accumulate.cpp b/tests/validation/CL/Accumulate.cpp
deleted file mode 100644
index 8f5c6d5..0000000
--- a/tests/validation/CL/Accumulate.cpp
+++ /dev/null

@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/AccumulateFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Tolerance value for comparing reference's output against implementation's output for floating point data types */
-constexpr AbsoluteTolerance<float> tolerance(1.0f);
-/** Input data sets **/
-const auto AccumulateU8Dataset  = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8));
-const auto AccumulateS16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S16));
-} // namespace
-TEST_SUITE(CL)
-TEST_SUITE(Accumulate)
-
-TEST_SUITE(U8)
-template <typename T1>
-using CLAccumulateFixture = AccumulateValidationFixture<CLTensor, CLAccessor, CLAccumulate, T1, int16_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLAccumulateFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLAccumulateFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE(AccumulateWeighted)
-
-TEST_SUITE(U8)
-template <typename T1>
-using CLAccumulateWeightedFixture = AccumulateWeightedValidationFixture<CLTensor, CLAccessor, CLAccumulateWeighted, T1, uint8_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLAccumulateWeightedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), AccumulateU8Dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLAccumulateWeightedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AccumulateU8Dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE(AccumulateSquared)
-
-TEST_SUITE(U8)
-template <typename T1>
-using CLAccumulateSquaredFixture = AccumulateSquaredValidationFixture<CLTensor, CLAccessor, CLAccumulateSquared, T1, int16_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLAccumulateSquaredFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLAccumulateSquaredFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Box3x3.cpp b/tests/validation/CL/Box3x3.cpp
deleted file mode 100644
index 6fd531b..0000000
--- a/tests/validation/CL/Box3x3.cpp
+++ /dev/null

@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLBox3x3.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Box3x3Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(Box3x3)
-
-template <typename T>
-using CLBox3x3Fixture = Box3x3ValidationFixture<CLTensor, CLAccessor, CLBox3x3, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLBox3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)),
-                                                                                                    datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLBox3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)),
-                                                                                                    datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/CannyEdge.cpp b/tests/validation/CL/CannyEdge.cpp
deleted file mode 100644
index c127eac..0000000
--- a/tests/validation/CL/CannyEdge.cpp
+++ /dev/null

@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLCannyEdge.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/CL/CLArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ImageFileDatasets.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/CannyEdgeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Allowed ratio of mismatches between target and reference (1.0 = 100%) */
-const float allowed_mismatch_ratio = 0.1f;
-
-const auto data = combine(framework::dataset::make("GradientSize",
-{ 3, 5, 7 }),
-combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }), datasets::BorderModes()));
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(CannyEdge)
-
-template <typename T>
-using CLCannyEdgeFixture = CannyEdgeValidationFixture<CLTensor, CLAccessor, CLKeyPointArray, CLCannyEdge, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLCannyEdgeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallImageFiles(), data), framework::dataset::make("Format", Format::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, AbsoluteTolerance<uint8_t>(0), allowed_mismatch_ratio);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLCannyEdgeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeImageFiles(), data), framework::dataset::make("Format", Format::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, AbsoluteTolerance<uint8_t>(0), allowed_mismatch_ratio);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/ChannelCombine.cpp b/tests/validation/CL/ChannelCombine.cpp
deleted file mode 100644
index 2ed0765..0000000
--- a/tests/validation/CL/ChannelCombine.cpp
+++ /dev/null

@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLMultiImage.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ChannelCombineFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(ChannelCombine)
-
-template <typename T>
-using CLChannelCombineFixture = ChannelCombineValidationFixture<CLMultiImage, CLTensor, CLAccessor, CLChannelCombine, T>;
-
-TEST_SUITE(RGBA)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // RGBA
-
-TEST_SUITE(YUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUV
-
-TEST_SUITE(YUVPlanar)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::NV12, Format::NV21, Format::IYUV, Format::YUV444 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::NV12, Format::NV21, Format::IYUV, Format::YUV444 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUVPlanar
-
-TEST_SUITE_END() // ChannelCombine
-TEST_SUITE_END() // CL
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/ChannelExtract.cpp b/tests/validation/CL/ChannelExtract.cpp
deleted file mode 100644
index b02741f..0000000
--- a/tests/validation/CL/ChannelExtract.cpp
+++ /dev/null

@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLMultiImage.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ChannelExtractFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-// Input data sets
-const auto ChannelExtractRGBADataset = combine(framework::dataset::make("FormatType", { Format::RGBA8888 }),
-                                               framework::dataset::make("ChannelType", { Channel::R, Channel::G, Channel::B, Channel::A }));
-const auto ChannelExtractYUVDataset = combine(framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 }),
-                                              framework::dataset::make("ChannelType", { Channel::Y, Channel::U, Channel::V }));
-const auto ChannelExtractYUVPlanarDataset = combine(framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444, Format::NV12, Format::NV21 }),
-                                                    framework::dataset::make("ChannelType", { Channel::Y, Channel::U, Channel::V }));
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(ChannelExtract)
-
-template <typename T>
-using CLChannelExtractFixture = ChannelExtractValidationFixture<CLMultiImage, CLTensor, CLAccessor, CLChannelExtract, T>;
-
-TEST_SUITE(RGBA)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ChannelExtractRGBADataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ChannelExtractRGBADataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // RGBA
-
-TEST_SUITE(YUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ChannelExtractYUVDataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ChannelExtractYUVDataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // YUV
-
-TEST_SUITE(YUVPlanar)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ChannelExtractYUVPlanarDataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ChannelExtractYUVPlanarDataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // YUVPlanar
-
-TEST_SUITE_END() // ChannelExtract
-TEST_SUITE_END() // CL
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/ColorConvert.cpp b/tests/validation/CL/ColorConvert.cpp
deleted file mode 100644
index 0d672a0..0000000
--- a/tests/validation/CL/ColorConvert.cpp
+++ /dev/null

@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLMultiImage.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ColorConvertFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_nv(2);
-constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
-
-// Input data sets
-const auto RGBDataset  = framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 });
-const auto YUYVDataset = framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 });
-
-const auto ColorConvert_RGBA_to_RGB = combine(framework::dataset::make("FormatType", { Format::RGBA8888 }),
-                                              framework::dataset::make("FormatType", { Format::RGB888 }));
-
-const auto ColorConvert_RGB_to_RGBA = combine(framework::dataset::make("FormatType", { Format::RGB888 }),
-                                              framework::dataset::make("FormatType", { Format::RGBA8888 }));
-
-const auto ColorConvert_RGB_to_U8 = combine(framework::dataset::make("FormatType", { Format::RGB888 }),
-                                            framework::dataset::make("FormatType", { Format::U8 }));
-
-const auto ColorConvert_YUYV_to_RGBDataset = combine(YUYVDataset,
-                                                     RGBDataset);
-
-const auto ColorConvert_YUVPlanar_to_RGBDataset = combine(framework::dataset::make("FormatType", { Format::IYUV, Format::NV12, Format::NV21 }),
-                                                          RGBDataset);
-
-const auto ColorConvert_RGBDataset_to_NVDataset = combine(RGBDataset,
-                                                          framework::dataset::make("FormatType", { Format::NV12, Format::IYUV, Format::YUV444 }));
-
-const auto ColorConvert_YUYVDataset_to_NVDataset = combine(YUYVDataset,
-                                                           framework::dataset::make("FormatType", { Format::NV12, Format::IYUV }));
-
-const auto ColorConvert_NVDataset_to_YUVDataset = combine(framework::dataset::make("FormatType", { Format::NV12, Format::NV21 }),
-                                                          framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444 }));
-
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(ColorConvert)
-
-template <typename T>
-using CLColorConvertFixture = ColorConvertValidationFixture<CLMultiImage, CLTensor, CLAccessor, CLColorConvert, T>;
-
-TEST_SUITE(RGBA)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_RGBA_to_RGB))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGBA_to_RGB))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END()
-
-TEST_SUITE(RGB)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_RGBA))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGB_to_RGBA))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END()
-
-TEST_SUITE(RGBtoU8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_U8))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx], tolerance_u8);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGB_to_U8))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx], tolerance_u8);
-    }
-}
-TEST_SUITE_END()
-
-TEST_SUITE(YUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_YUYV_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_YUYV_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END()
-
-TEST_SUITE(YUVPlanar)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_YUVPlanar_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_YUVPlanar_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END()
-
-TEST_SUITE(NV)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_RGBDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx], tolerance_nv);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGBDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx], tolerance_nv);
-    }
-}
-TEST_SUITE_END()
-
-TEST_SUITE(YUYVtoNV)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_YUYVDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_YUYVDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-
-TEST_SUITE_END()
-
-TEST_SUITE(NVtoYUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_NVDataset_to_YUVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_NVDataset_to_YUVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Convolution.cpp b/tests/validation/CL/Convolution.cpp
deleted file mode 100644
index 1608e7c..0000000
--- a/tests/validation/CL/Convolution.cpp
+++ /dev/null

@@ -1,285 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLConvolution.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ConvolutionFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(CustomConvolution)
-TEST_SUITE(Square3x3)
-template <typename T>
-using CLConvolutionFixture = ConvolutionSquareValidationFixture<CLTensor, CLAccessor, CLConvolution3x3, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 3 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 3 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square 3x3
-
-TEST_SUITE(Square5x5)
-template <typename T>
-using CLConvolutionFixture = ConvolutionSquareValidationFixture<CLTensor, CLAccessor, CLConvolution5x5, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square5x5
-
-TEST_SUITE(Square7x7)
-template <typename T>
-using CLConvolutionFixture = ConvolutionSquareValidationFixture<CLTensor, CLAccessor, CLConvolution7x7, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square7x7
-
-TEST_SUITE(Square9x9)
-
-template <typename T>
-using CLConvolutionFixture = ConvolutionSquareValidationFixture<CLTensor, CLAccessor, CLConvolution9x9, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square9x9
-
-TEST_SUITE(Rectangle)
-template <typename T>
-using CLConvolutionFixture = ConvolutionRectangleValidationFixture<CLTensor, CLAccessor, CLConvolutionRectangle, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                                 framework::dataset::make("filter_width", { 3, 5, 7, 9 })),
-                                                                                                         framework::dataset::make("filter_height", { 3, 5, 7, 9 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                                 framework::dataset::make("filter_width", { 3, 5, 7, 9 })),
-                                                                                                         framework::dataset::make("filter_height", { 3, 5, 7, 9 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Rectangle
-
-TEST_SUITE(Separable5x5)
-template <typename T>
-using CLConvolutionFixture = ConvolutionSeparableValidationFixture<CLTensor, CLAccessor, CLConvolution5x5, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Separable5x5
-
-TEST_SUITE(Separable7x7)
-template <typename T>
-using CLConvolutionFixture = ConvolutionSeparableValidationFixture<CLTensor, CLAccessor, CLConvolution7x7, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Separable7x7
-
-TEST_SUITE(Separable9x9)
-template <typename T>
-using CLConvolutionFixture = ConvolutionSeparableValidationFixture<CLTensor, CLAccessor, CLConvolution9x9, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)));
-}
-TEST_SUITE_END()
-TEST_SUITE_END() // Separable9x9
-
-TEST_SUITE_END() // Custom Convolution
-TEST_SUITE_END() // CL
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Derivative.cpp b/tests/validation/CL/Derivative.cpp
deleted file mode 100644
index b8f6856..0000000
--- a/tests/validation/CL/Derivative.cpp
+++ /dev/null

@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLDerivative.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/GradientDimensionDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/DerivativeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(Derivative)
-
-using CLDerivativeFixture = DerivativeValidationFixture<CLTensor, CLAccessor, CLDerivative, uint8_t, int16_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDerivativeFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDerivativeFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Dilate.cpp b/tests/validation/CL/Dilate.cpp
deleted file mode 100644
index c5fdb3f..0000000
--- a/tests/validation/CL/Dilate.cpp
+++ /dev/null

@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLDilate.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/DilateFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(Dilate)
-
-template <typename T>
-using CLDilateFixture = DilateValidationFixture<CLTensor, CLAccessor, CLDilate, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDilateFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)),
-                                                                                                    datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDilateFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)),
-                                                                                                    datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/EqualizeHistogram.cpp b/tests/validation/CL/EqualizeHistogram.cpp
deleted file mode 100644
index 3585825..0000000
--- a/tests/validation/CL/EqualizeHistogram.cpp
+++ /dev/null

@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/EqualizeHistogramFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(EqualizeHistogram)
-
-template <typename T>
-using CLEqualizeHistogramFixture = EqualizeHistogramValidationFixture<CLTensor, CLAccessor, CLEqualizeHistogram, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLEqualizeHistogramFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                               DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLEqualizeHistogramFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                               DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Erode.cpp b/tests/validation/CL/Erode.cpp
deleted file mode 100644
index dd75b59..0000000
--- a/tests/validation/CL/Erode.cpp
+++ /dev/null

@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLErode.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ErodeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(Erode)
-
-template <typename T>
-using CLErodeFixture = ErodeValidationFixture<CLTensor, CLAccessor, CLErode, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLErodeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                           DataType::U8)),
-                                                                                                   datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLErodeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                           DataType::U8)),
-                                                                                                   datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/FastCorners.cpp b/tests/validation/CL/FastCorners.cpp
deleted file mode 100644
index 40a511e..0000000
--- a/tests/validation/CL/FastCorners.cpp
+++ /dev/null

@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLFastCorners.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/CL/CLArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ImageFileDatasets.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/FastCornersFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Tolerance used to compare corner strengths */
-const AbsoluteTolerance<float> tolerance(0.5f);
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(FastCorners)
-
-template <typename T>
-using CLFastCornersFixture = FastCornersValidationFixture<CLTensor, CLAccessor, CLKeyPointArray, CLFastCorners, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLFastCornersFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallImageFiles(), framework::dataset::make("Format", Format::U8)),
-                                                                                                                 framework::dataset::make("SuppressNonMax", { false, true })),
-                                                                                                         framework::dataset::make("BorderMode", BorderMode::UNDEFINED)))
-{
-    // Validate output
-    CLArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(), array.buffer() + array.num_values(), _reference.begin(), _reference.end(), tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLFastCornersFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeImageFiles(), framework::dataset::make("Format", Format::U8)),
-                                                                                                                 framework::dataset::make("SuppressNonMax", { false, true })),
-                                                                                                         framework::dataset::make("BorderMode", BorderMode::UNDEFINED)))
-{
-    // Validate output
-    CLArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(), array.buffer() + array.num_values(), _reference.begin(), _reference.end(), tolerance);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Gaussian3x3.cpp b/tests/validation/CL/Gaussian3x3.cpp
deleted file mode 100644
index b7672bd..0000000
--- a/tests/validation/CL/Gaussian3x3.cpp
+++ /dev/null

@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Gaussian3x3Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(Gaussian3x3)
-
-template <typename T>
-using CLGaussian3x3Fixture = Gaussian3x3ValidationFixture<CLTensor, CLAccessor, CLGaussian3x3, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGaussian3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                         datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGaussian3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                         datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Gaussian5x5.cpp b/tests/validation/CL/Gaussian5x5.cpp
deleted file mode 100644
index f2a1a30..0000000
--- a/tests/validation/CL/Gaussian5x5.cpp
+++ /dev/null

@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Gaussian5x5Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 5;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(Gaussian5x5)
-
-template <typename T>
-using CLGaussian5x5Fixture = Gaussian5x5ValidationFixture<CLTensor, CLAccessor, CLGaussian5x5, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGaussian5x5Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                         datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGaussian5x5Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                         datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/GaussianPyramid.cpp b/tests/validation/CL/GaussianPyramid.cpp
deleted file mode 100644
index d29f675..0000000
--- a/tests/validation/CL/GaussianPyramid.cpp
+++ /dev/null

@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/GaussianPyramidHalfFixture.h"
-#include "tests/validation/reference/Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-const auto small_gaussian_pyramid_levels = combine(datasets::Medium2DShapes(), datasets::BorderModes()) * framework::dataset::make("numlevels", 2, 4);
-const auto large_gaussian_pyramid_levels = combine(datasets::Large2DShapes(), datasets::BorderModes()) * framework::dataset::make("numlevels", 2, 5);
-
-template <typename T>
-inline void validate_gaussian_pyramid(const CLPyramid &target, const std::vector<SimpleTensor<T>> &reference, BorderMode border_mode)
-{
-    ValidRegion prev_valid_region = shape_to_valid_region(reference[0].shape());
-
-    for(size_t i = 1; i < reference.size(); ++i)
-    {
-        const ValidRegion valid_region = shape_to_valid_region_gaussian_pyramid_half(reference[i - 1].shape(), prev_valid_region, (border_mode == BorderMode::UNDEFINED));
-
-        // Validate outputs
-        validate(CLAccessor(*(target.get_pyramid_level(i))), reference[i], valid_region);
-
-        // Keep the valid region for the next level
-        prev_valid_region = valid_region;
-    }
-}
-
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(GaussianPyramid)
-TEST_SUITE(Half)
-template <typename T>
-using CLGaussianPyramidHalfFixture = GaussianPyramidHalfValidationFixture<CLTensor, CLAccessor, CLGaussianPyramidHalf, T, CLPyramid>;
-
-FIXTURE_DATA_TEST_CASE(RunSmallGaussianPyramidHalf, CLGaussianPyramidHalfFixture<uint8_t>, framework::DatasetMode::NIGHTLY, small_gaussian_pyramid_levels)
-{
-    validate_gaussian_pyramid(_target, _reference, _border_mode);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLargeGaussianPyramidHalf, CLGaussianPyramidHalfFixture<uint8_t>, framework::DatasetMode::NIGHTLY, large_gaussian_pyramid_levels)
-{
-    validate_gaussian_pyramid(_target, _reference, _border_mode);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/HOGDescriptor.cpp b/tests/validation/CL/HOGDescriptor.cpp
deleted file mode 100644
index a73e563..0000000
--- a/tests/validation/CL/HOGDescriptor.cpp
+++ /dev/null

@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLHOG.h"
-#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/HOGDescriptorDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HOGDescriptorFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-RelativeTolerance<float> tolerance(0.001f);
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(HOGDescriptor)
-
-using CLHOGDescriptorFixture = HOGDescriptorValidationFixture<CLTensor, CLHOG, CLAccessor, CLHOGDescriptor, uint8_t, float>;
-
-// *INDENT-OFF*
-// clang-format off
-FIXTURE_DATA_TEST_CASE(RunSmall, CLHOGDescriptorFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(
-                       datasets::SmallHOGDescriptorDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLHOGDescriptorFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(
-                       datasets::LargeHOGDescriptorDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance);
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/HOGDetector.cpp b/tests/validation/CL/HOGDetector.cpp
deleted file mode 100644
index 2d1904f..0000000
--- a/tests/validation/CL/HOGDetector.cpp
+++ /dev/null

@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
-#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/CL/CLArrayAccessor.h"
-#include "tests/CL/CLHOGAccessor.h"
-#include "tests/datasets/HOGDescriptorDataset.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HOGDetectorFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Set the tolerance (percentage) used when validating the score of detection window. */
-RelativeTolerance<float> tolerance(0.01f);
-
-/* Input dataset (values must be a multiple of the HOGInfo block_size) */
-const auto DetectionWindowStrideDataset = framework::dataset::make("DetectionWindowStride", { Size2D(8, 8), Size2D(16, 16) });
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(HOGDetector)
-
-// *INDENT-OFF*
-// clang-format off
-using CLHOGDetectorFixture = HOGDetectorValidationFixture<CLTensor,
-                                                          CLHOG,
-                                                          CLDetectionWindowArray,
-                                                          CLHOGDescriptor,
-                                                          CLAccessor,
-                                                          CLArrayAccessor<DetectionWindow>,
-                                                          CLHOGAccessor,
-                                                          CLHOGDetector,
-                                                          uint8_t,
-                                                          float>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLHOGDetectorFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       DetectionWindowStrideDataset,
-                       datasets::SmallHOGDescriptorDataset()),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLHOGDetectorFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       DetectionWindowStrideDataset,
-                       datasets::LargeHOGDescriptorDataset()),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/HOGMultiDetection.cpp b/tests/validation/CL/HOGMultiDetection.cpp
deleted file mode 100644
index 4ca1dab..0000000
--- a/tests/validation/CL/HOGMultiDetection.cpp
+++ /dev/null

@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLMultiHOG.h"
-#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
-#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/CL/CLArrayAccessor.h"
-#include "tests/CL/CLHOGAccessor.h"
-#include "tests/datasets/HOGMultiDetectionDataset.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HOGMultiDetectionFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Set the tolerance (percentage) used when validating the strength of detection window. */
-RelativeTolerance<float> tolerance(0.1f);
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(HOGMultiDetection)
-
-// *INDENT-OFF*
-// clang-format off
-using CLHOGMultiDetectionFixture = HOGMultiDetectionValidationFixture<CLTensor,
-                                                                      CLHOG,
-                                                                      CLMultiHOG,
-                                                                      CLDetectionWindowArray,
-                                                                      CLSize2DArray,
-                                                                      CLAccessor,
-                                                                      CLArrayAccessor<Size2D>,
-                                                                      CLArrayAccessor<DetectionWindow>,
-                                                                      CLHOGAccessor,
-                                                                      CLHOGMultiDetection,
-                                                                      uint8_t,
-                                                                      float>;
-
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLHOGMultiDetectionFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::SmallHOGMultiDetectionDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})),
-                       framework::dataset::make("NonMaximaSuppression", {false, true})))
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLHOGMultiDetectionFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::LargeHOGMultiDetectionDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})),
-                       framework::dataset::make("NonMaximaSuppression", {false, true})))
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/HarrisCorners.cpp b/tests/validation/CL/HarrisCorners.cpp
deleted file mode 100644
index 157102f..0000000
--- a/tests/validation/CL/HarrisCorners.cpp
+++ /dev/null

@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/CL/CLArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ImageFileDatasets.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HarrisCornersFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-const auto data_nightly   = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }), combine(framework::dataset::make("BlockSize", { 3, 5, 7 }), datasets::BorderModes()));
-const auto data_precommit = combine(framework::dataset::make("GradientSize", { 3 }), combine(framework::dataset::make("BlockSize", { 3 }), datasets::BorderModes()));
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(HarrisCorners)
-
-template <typename T>
-using CLHarrisCornersFixture = HarrisCornersValidationFixture<CLTensor, CLAccessor, CLKeyPointArray, CLHarrisCorners, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLHarrisCornersFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallImageFiles(), data_precommit), framework::dataset::make("Format",
-                                                                                                           Format::U8)))
-{
-    // Validate output
-    CLArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(), array.buffer() + array.num_values(), _reference.begin(), _reference.end(), RelativeTolerance<float>(0.0001f));
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLHarrisCornersFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeImageFiles(), data_nightly), framework::dataset::make("Format",
-                                                                                                           Format::U8)))
-{
-    // Validate output
-    CLArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(), array.buffer() + array.num_values(), _reference.begin(), _reference.end(), RelativeTolerance<float>(0.0001f));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Histogram.cpp b/tests/validation/CL/Histogram.cpp
deleted file mode 100644
index 2619a00..0000000
--- a/tests/validation/CL/Histogram.cpp
+++ /dev/null

@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLDistribution1D.h"
-#include "arm_compute/runtime/CL/functions/CLHistogram.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HistogramFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(Histogram)
-
-template <typename T>
-using CLHistogramFixture = HistogramValidationFixture<CLTensor, CLAccessor, CLHistogram, T, CLDistribution1D>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLHistogramFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                       DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLHistogramFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                       DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/IntegralImage.cpp b/tests/validation/CL/IntegralImage.cpp
deleted file mode 100644
index e3b728a..0000000
--- a/tests/validation/CL/IntegralImage.cpp
+++ /dev/null

@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/IntegralImageFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(IntegralImage)
-template <typename T>
-using CLIntegralImageFixture = IntegralImageValidationFixture<CLTensor, CLAccessor, CLIntegralImage, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLIntegralImageFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                           DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLIntegralImageFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                           DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/LaplacianPyramid.cpp b/tests/validation/CL/LaplacianPyramid.cpp
deleted file mode 100644
index 801115e..0000000
--- a/tests/validation/CL/LaplacianPyramid.cpp
+++ /dev/null

@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/LaplacianPyramidFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Absolute tolerance value for comparing reference's output against implementation's output for DataType::S16
- * Tolerance is needed for calculation uncertainties introduced from the layers
- */
-AbsoluteTolerance<int16_t> tolerance_int16(1);
-const auto                 small_laplacian_pyramid_levels = framework::dataset::make("NumLevels", 2, 3);
-const auto                 large_laplacian_pyramid_levels = framework::dataset::make("NumLevels", 2, 5);
-
-const auto formats = combine(framework::dataset::make("FormatIn", Format::U8), framework::dataset::make("FormatOut", Format::S16));
-
-template <typename T>
-inline void validate_laplacian_pyramid(const CLPyramid &target, const std::vector<SimpleTensor<T>> &reference, BorderMode border_mode)
-{
-    CLTensor   *level_image  = target.get_pyramid_level(0);
-    ValidRegion valid_region = shape_to_valid_region(reference[0].shape(), border_mode == BorderMode::UNDEFINED, BorderSize(2));
-
-    // Validate lowest level
-    validate(CLAccessor(*level_image), reference[0], valid_region);
-
-    // Validate remaining levels
-    for(size_t lev = 1; lev < target.info()->num_levels(); lev++)
-    {
-        level_image                = target.get_pyramid_level(lev);
-        CLTensor *prev_level_image = target.get_pyramid_level(lev - 1);
-
-        valid_region = shape_to_valid_region_laplacian_pyramid(prev_level_image->info()->tensor_shape(),
-                                                               prev_level_image->info()->valid_region(),
-                                                               border_mode == BorderMode::UNDEFINED);
-
-        // Validate level
-        validate(CLAccessor(*level_image), reference[lev], valid_region, tolerance_int16);
-    }
-}
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(LaplacianPyramid)
-
-// *INDENT-OFF*
-// clang-format off
-
-using CLLaplacianPyramidFixture = LaplacianPyramidValidationFixture<CLTensor, CLAccessor, CLLaplacianPyramid, uint8_t, int16_t, CLPyramid>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLLaplacianPyramidFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::Medium2DShapes(),
-                       datasets::BorderModes()),
-                       small_laplacian_pyramid_levels),
-                       formats))
-{
-    validate_laplacian_pyramid(_target, _reference, _border_mode);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLLaplacianPyramidFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::Large2DShapes(),
-                       datasets::BorderModes()),
-                       large_laplacian_pyramid_levels),
-                       formats))
-{
-    validate_laplacian_pyramid(_target, _reference, _border_mode);
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/LaplacianReconstruct.cpp b/tests/validation/CL/LaplacianReconstruct.cpp
deleted file mode 100644
index e6e32ce..0000000
--- a/tests/validation/CL/LaplacianReconstruct.cpp
+++ /dev/null

@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/LaplacianReconstructFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-const auto small_laplacian_reconstruct_levels = framework::dataset::make("NumLevels", 2, 3);
-const auto large_laplacian_reconstruct_levels = framework::dataset::make("NumLevels", 2, 5);
-
-const auto formats = combine(framework::dataset::make("FormatIn", Format::S16), framework::dataset::make("FormatOut", Format::U8));
-
-template <typename T>
-void validate_laplacian_reconstruct(CLTensor &target, const SimpleTensor<T> &reference, BorderMode border_mode, size_t num_levels)
-{
-    const unsigned int filter_size = 5;
-    const unsigned int border_size(filter_size / 2);
-
-    BorderSize border(std::pow(border_size, num_levels));
-
-    // Validate output
-    ValidRegion valid_region = shape_to_valid_region(reference.shape(), border_mode == BorderMode::UNDEFINED, border);
-    validate(CLAccessor(target), reference, valid_region);
-}
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(LaplacianReconstruct)
-
-// *INDENT-OFF*
-// clang-format off
-
-using CLLaplacianReconstructFixture = LaplacianReconstructValidationFixture<CLTensor, CLAccessor, CLLaplacianReconstruct, CLLaplacianPyramid, int16_t, uint8_t, CLPyramid>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLLaplacianReconstructFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::Medium2DShapes(),
-                       datasets::BorderModes()),
-                       small_laplacian_reconstruct_levels),
-                       formats))
-{
-    validate_laplacian_reconstruct(_target, _reference, _border_mode, _pyramid_levels);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLLaplacianReconstructFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::Large2DShapes(),
-                       datasets::BorderModes()),
-                       large_laplacian_reconstruct_levels),
-                       formats))
-{
-    validate_laplacian_reconstruct(_target, _reference, _border_mode, _pyramid_levels);
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Magnitude.cpp b/tests/validation/CL/Magnitude.cpp
deleted file mode 100644
index bf5879b..0000000
--- a/tests/validation/CL/Magnitude.cpp
+++ /dev/null

@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/MagnitudeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-template <typename T>
-AbsoluteTolerance<T> tolerance(MagnitudeType magnitude_type)
-{
-    return AbsoluteTolerance<T>((MagnitudeType::L1NORM == magnitude_type) ? 0 : 1);
-}
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(Magnitude)
-
-template <typename T>
-using CLMagnitudeFixture = MagnitudeValidationFixture<CLTensor, CLAccessor, CLMagnitude, T>;
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMagnitudeFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                       framework::dataset::make("MagnitudeType", { MagnitudeType::L1NORM, MagnitudeType::L2NORM })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance<int16_t>(_magnitude_type));
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMagnitudeFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::Large2DShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                       framework::dataset::make("MagnitudeType", { MagnitudeType::L1NORM, MagnitudeType::L2NORM })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance<int16_t>(_magnitude_type));
-}
-TEST_SUITE_END() // S16
-
-TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMagnitudeFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("Format", Format::S32)),
-                                                                                                       framework::dataset::make("MagnitudeType", { MagnitudeType::L1NORM, MagnitudeType::L2NORM })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance<int32_t>(_magnitude_type));
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMagnitudeFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::Large2DShapes(), framework::dataset::make("Format", Format::S32)),
-                                                                                                       framework::dataset::make("MagnitudeType", { MagnitudeType::L1NORM, MagnitudeType::L2NORM })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance<int32_t>(_magnitude_type));
-}
-TEST_SUITE_END() // S32
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/MeanStdDev.cpp b/tests/validation/CL/MeanStdDev.cpp
deleted file mode 100644
index 0e5135e..0000000
--- a/tests/validation/CL/MeanStdDev.cpp
+++ /dev/null

@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/MeanStdDevFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-RelativeTolerance<float> tolerance_rel_high_error(0.05f);
-RelativeTolerance<float> tolerance_rel_low_error(0.0005f);
-AbsoluteTolerance<float> tolerance_rel_high_error_f32(0.01f);
-AbsoluteTolerance<float> tolerance_rel_low_error_f32(0.00001f);
-AbsoluteTolerance<float> tolerance_rel_high_error_f16(0.1f);
-AbsoluteTolerance<float> tolerance_rel_low_error_f16(0.01f);
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(MeanStdDev)
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(16U, 16U), 1, DataType::F32),    // Wrong input data type
-                                                       TensorInfo(TensorShape(16U, 5U, 16U), 1, DataType::U8), // Invalid shape
-                                                       TensorInfo(TensorShape(16U, 16U), 1, DataType::U8),     // Valid
-                                                     }),
-               framework::dataset::make("Expected", { false, false, true })),
-               input_info, expected)
-{
-    ARM_COMPUTE_EXPECT(bool(CLMeanStdDev::validate(&input_info.clone()->set_is_resizable(false), nullptr, nullptr)) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-template <typename T>
-using CLMeanStdDevFixture = MeanStdDevValidationFixture<CLTensor, CLAccessor, CLMeanStdDev, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMeanStdDevFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                        DataType::U8)))
-{
-    // Validate mean output
-    validate(_target.first, _reference.first);
-
-    // Validate std_dev output
-    validate(_target.second, _reference.second, tolerance_rel_high_error);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMeanStdDevFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                        DataType::U8)))
-{
-    // Validate mean output
-    validate(_target.first, _reference.first, tolerance_rel_low_error);
-
-    // Validate std_dev output
-    validate(_target.second, _reference.second, tolerance_rel_high_error);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMeanStdDevFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                     DataType::F16)))
-{
-    // Validate mean output
-    validate(_target.first, _reference.first, tolerance_rel_low_error_f16);
-
-    // Validate std_dev output
-    validate(_target.second, _reference.second, tolerance_rel_high_error_f16);
-}
-TEST_SUITE_END() // F16
-
-TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMeanStdDevFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                      DataType::F32)))
-{
-    // Validate mean output
-    validate(_target.first, _reference.first, tolerance_rel_low_error_f32);
-
-    // Validate std_dev output
-    validate(_target.second, _reference.second, tolerance_rel_high_error_f32);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMeanStdDevFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                      DataType::F32)))
-{
-    // Validate mean output
-    validate(_target.first, _reference.first, tolerance_rel_low_error_f32);
-
-    // Validate std_dev output
-    validate(_target.second, _reference.second, tolerance_rel_high_error_f32);
-}
-TEST_SUITE_END() // F32
-
-TEST_SUITE_END() // MeanStdDev
-TEST_SUITE_END() // CL
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Median3x3.cpp b/tests/validation/CL/Median3x3.cpp
deleted file mode 100644
index 9a09ae5..0000000
--- a/tests/validation/CL/Median3x3.cpp
+++ /dev/null

@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Median3x3Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(Median3x3)
-template <typename T>
-using CLMedian3x3Fixture = Median3x3ValidationFixture<CLTensor, CLAccessor, CLMedian3x3, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMedian3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                               DataType::U8)),
-                                                                                                       datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMedian3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                               DataType::U8)),
-                                                                                                       datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/MinMaxLocation.cpp b/tests/validation/CL/MinMaxLocation.cpp
deleted file mode 100644
index 1ad863d..0000000
--- a/tests/validation/CL/MinMaxLocation.cpp
+++ /dev/null

@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/CL/CLArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/MinMaxLocationFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(MinMaxLocation)
-
-template <typename T>
-using CLMinMaxLocationFixture = MinMaxLocationValidationFixture<CLTensor, CLAccessor, CLArray<Coordinates2D>, CLArrayAccessor<Coordinates2D>, CLMinMaxLocation, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMinMaxLocationFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMinMaxLocationFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMinMaxLocationFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::S16)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMinMaxLocationFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::S16)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-TEST_SUITE_END() // S16
-
-TEST_SUITE(Float)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMinMaxLocationFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                          DataType::F32)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMinMaxLocationFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                          DataType::F32)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-TEST_SUITE_END() // F32
-
-TEST_SUITE_END() // MinMaxLocation
-TEST_SUITE_END() // CL
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/NonLinearFilter.cpp b/tests/validation/CL/NonLinearFilter.cpp
deleted file mode 100644
index 3fd9d5c..0000000
--- a/tests/validation/CL/NonLinearFilter.cpp
+++ /dev/null

@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/MatrixPatternDataset.h"
-#include "tests/datasets/NonLinearFilterFunctionDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/NonLinearFilterFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(NonLinearFilter)
-
-template <typename T>
-using CLNonLinearFilterFixture = NonLinearFilterValidationFixture<CLTensor, CLAccessor, CLNonLinearFilter, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLNonLinearFilterFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     datasets::NonLinearFilterFunctions()),
-                                                                                                                     framework::dataset::make("MaskSize", { 3U, 5U })),
-                                                                                                                     datasets::MatrixPatterns()),
-                                                                                                                     datasets::BorderModes()),
-                                                                                                             framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), _border_size));
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLNonLinearFilterFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                     datasets::NonLinearFilterFunctions()),
-                                                                                                                     framework::dataset::make("MaskSize", { 3U, 5U })),
-                                                                                                                     datasets::MatrixPatterns()),
-                                                                                                                     datasets::BorderModes()),
-                                                                                                             framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), _border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/OpticalFlow.cpp b/tests/validation/CL/OpticalFlow.cpp
deleted file mode 100644
index 3636a8f..0000000
--- a/tests/validation/CL/OpticalFlow.cpp
+++ /dev/null

@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/CL/CLArrayAccessor.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/OpticalFlowDataset.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/OpticalFlowFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(OpticalFlow)
-
-// *INDENT-OFF*
-// clang-format off
-using CLOpticalFlowFixture = OpticalFlowValidationFixture<CLTensor,
-                                                          CLAccessor,
-                                                          CLKeyPointArray,
-                                                          CLArrayAccessor<KeyPoint>,
-                                                          CLOpticalFlow,
-                                                          CLPyramid,
-                                                          CLGaussianPyramidHalf,
-                                                          uint8_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLOpticalFlowFixture, framework::DatasetMode::NIGHTLY, combine(combine(
-                       datasets::SmallOpticalFlowDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       datasets::BorderModes()))
-{
-    // Validate output
-    CLArrayAccessor<KeyPoint> array(_target);
-
-    validate_keypoints(array.buffer(),
-                       array.buffer() + array.num_values(),
-                       _reference.begin(),
-                       _reference.end());
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLOpticalFlowFixture, framework::DatasetMode::NIGHTLY, combine(combine(
-                       datasets::LargeOpticalFlowDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       datasets::BorderModes()))
-{
-    // Validate output
-    CLArrayAccessor<KeyPoint> array(_target);
-
-    validate_keypoints(array.buffer(),
-                       array.buffer() + array.num_values(),
-                       _reference.begin(),
-                       _reference.end());
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Phase.cpp b/tests/validation/CL/Phase.cpp
deleted file mode 100644
index be7f9df..0000000
--- a/tests/validation/CL/Phase.cpp
+++ /dev/null

@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLPhase.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/PhaseFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(1);
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(Phase)
-
-template <typename T>
-using CLPhaseFixture = PhaseValidationFixture<CLTensor, CLAccessor, CLPhase, T>;
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPhaseFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                   framework::dataset::make("PhaseType", { PhaseType::SIGNED, PhaseType::UNSIGNED })))
-{
-    // Validate output
-    validate_wrap(CLAccessor(_target), _reference, tolerance_value, 0);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLPhaseFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::Large2DShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                   framework::dataset::make("PhaseType", { PhaseType::SIGNED, PhaseType::UNSIGNED })))
-{
-    // Validate output
-    validate_wrap(CLAccessor(_target), _reference, tolerance_value, 0);
-}
-TEST_SUITE_END() // S16
-
-TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPhaseFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("Format", Format::S32)),
-                                                                                                   framework::dataset::make("PhaseType", { PhaseType::SIGNED, PhaseType::UNSIGNED })))
-{
-    // Validate output
-    validate_wrap(CLAccessor(_target), _reference, tolerance_value, 0);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLPhaseFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::Large2DShapes(), framework::dataset::make("Format", Format::S32)),
-                                                                                                   framework::dataset::make("PhaseType", { PhaseType::SIGNED, PhaseType::UNSIGNED })))
-{
-    // Validate output
-    validate_wrap(CLAccessor(_target), _reference, tolerance_value, 0);
-}
-TEST_SUITE_END() // S32
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Scharr.cpp b/tests/validation/CL/Scharr.cpp
deleted file mode 100644
index ed1fec8..0000000
--- a/tests/validation/CL/Scharr.cpp
+++ /dev/null

@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/GradientDimensionDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ScharrFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(Scharr)
-
-TEST_SUITE(W3x3)
-using CLScharr3x3Fixture = ScharrValidationFixture<CLTensor, CLAccessor, CLScharr3x3, uint8_t, int16_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScharr3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                      Format::U8)),
-                                                                                              datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScharr3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                      Format::U8)),
-                                                                                              datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Sobel.cpp b/tests/validation/CL/Sobel.cpp
deleted file mode 100644
index 3aee0fe..0000000
--- a/tests/validation/CL/Sobel.cpp
+++ /dev/null

@@ -1,259 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
-#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
-#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/SobelFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(Sobel)
-
-TEST_SUITE(W3x3)
-using CLSobel3x3Fixture = SobelValidationFixture<CLTensor, CLAccessor, CLSobel3x3, uint8_t, int16_t>;
-
-TEST_SUITE(X)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-}
-TEST_SUITE_END()
-
-TEST_SUITE(Y)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-
-TEST_SUITE(XY)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE(W5x5)
-using CLSobel5x5Fixture = SobelValidationFixture<CLTensor, CLAccessor, CLSobel5x5, uint8_t, int16_t>;
-
-TEST_SUITE(X)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-}
-TEST_SUITE_END()
-TEST_SUITE(Y)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE(XY)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE(W7x7)
-using CLSobel7x7Fixture = SobelValidationFixture<CLTensor, CLAccessor, CLSobel7x7, uint8_t, int32_t>;
-
-TEST_SUITE(X)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-}
-TEST_SUITE_END()
-TEST_SUITE(Y)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE(XY)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(CLAccessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(CLAccessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/TableLookup.cpp b/tests/validation/CL/TableLookup.cpp
deleted file mode 100644
index 415b91c..0000000
--- a/tests/validation/CL/TableLookup.cpp
+++ /dev/null

@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/CL/CLLutAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/TableLookupFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(TableLookup)
-
-template <typename T>
-using CLTableLookupFixture = TableLookupValidationFixture<CLTensor, CLAccessor, CLTableLookup, CLLutAccessor<T>, CLLut, T>;
-TEST_SUITE(U8)
-
-FIXTURE_DATA_TEST_CASE(RunSmallU8, CLTableLookupFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLargeU8, CLTableLookupFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END()
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmallS16, CLTableLookupFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::S16)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLargeS16, CLTableLookupFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::S16)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/Threshold.cpp b/tests/validation/CL/Threshold.cpp
deleted file mode 100644
index be26245..0000000
--- a/tests/validation/CL/Threshold.cpp
+++ /dev/null

@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLThreshold.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/datasets/ThresholdDataset.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ThresholdFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(Threshold)
-
-template <typename T>
-using CLThresholdFixture = ThresholdValidationFixture<CLTensor, CLAccessor, CLThreshold, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLThresholdFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), datasets::MixedThresholdDataset()),
-                                                                                                       framework::dataset::make("DataType",
-                                                                                                               DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLThresholdFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), datasets::MixedThresholdDataset()),
-                                                                                                       framework::dataset::make("DataType",
-                                                                                                               DataType::U8)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/WarpAffine.cpp b/tests/validation/CL/WarpAffine.cpp
deleted file mode 100644
index d10ba7f..0000000
--- a/tests/validation/CL/WarpAffine.cpp
+++ /dev/null

@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/InterpolationPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/WarpAffineFixture.h"
-#include "tests/validation/reference/Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Tolerance */
-constexpr AbsoluteTolerance<uint8_t> tolerance(1);
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(WarpAffine)
-
-template <typename T>
-using CLWarpAffineFixture = WarpAffineValidationFixture<CLTensor, CLAccessor, CLWarpAffine, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWarpAffineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                                                                framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                        datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, _valid_mask, tolerance, 0.02f);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWarpAffineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                                                                framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                        datasets::BorderModes()))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, _valid_mask, tolerance, 0.02f);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/CL/WarpPerspective.cpp b/tests/validation/CL/WarpPerspective.cpp
deleted file mode 100644
index dd05059..0000000
--- a/tests/validation/CL/WarpPerspective.cpp
+++ /dev/null

@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/WarpPerspectiveFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(1);
-constexpr float                      tolerance_number = 0.2f;
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(WarpPerspective)
-
-template <typename T>
-using CLWarpPerspectiveFixture = WarpPerspectiveValidationFixture<CLTensor, CLAccessor, CLWarpPerspective, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWarpPerspectiveFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                     DataType::U8)),
-                                                                                                                     framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                             datasets::BorderModes()))
-{
-    validate(CLAccessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWarpPerspectiveFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                     DataType::U8)),
-                                                                                                                     framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                             datasets::BorderModes()))
-{
-    validate(CLAccessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/NEON/Convolution.cpp b/tests/validation/NEON/Convolution.cpp
deleted file mode 100644
index 2fb4327..0000000
--- a/tests/validation/NEON/Convolution.cpp
+++ /dev/null

@@ -1,295 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ConvolutionFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Tolerance value for comparing reference's output against implementation
- *
- * This is due to the fact that Neon target performs multiplication with reciprocal of scale,
- * while reference performs direct division with scale.
- */
-constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
-constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(CustomConvolution)
-TEST_SUITE(Square3x3)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution3x3, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 3 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 3 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square3x3
-
-TEST_SUITE(Square5x5)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution5x5, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square5x5
-
-TEST_SUITE(Square7x7)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution7x7, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square7x7
-
-TEST_SUITE(Square9x9)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution9x9, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square9x9
-
-TEST_SUITE(Rectangle)
-template <typename T>
-using NEConvolutionFixture = ConvolutionRectangleValidationFixture<Tensor, Accessor, NEConvolutionRectangle, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                                 framework::dataset::make("filter_width", { 3, 5, 7, 9 })),
-                                                                                                         framework::dataset::make("filter_height", { 3, 5, 7, 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                                 framework::dataset::make("filter_width", { 3, 5, 7, 9 })),
-                                                                                                         framework::dataset::make("filter_height", { 3, 5, 7, 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Rectangle
-
-TEST_SUITE(Separable5x5)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSeparableValidationFixture<Tensor, Accessor, NEConvolution5x5, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Separable5x5
-
-TEST_SUITE(Separable7x7)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSeparableValidationFixture<Tensor, Accessor, NEConvolution7x7, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Separable7x7
-
-TEST_SUITE(Separable9x9)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSeparableValidationFixture<Tensor, Accessor, NEConvolution9x9, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Separable9x9
-
-TEST_SUITE_END() // CustomConvolution
-TEST_SUITE_END() // Neon
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/NEON/Remap.cpp b/tests/validation/NEON/Remap.cpp
new file mode 100644
index 0000000..3c02f8e
--- /dev/null
+++ b/tests/validation/NEON/Remap.cpp

@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NERemap.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/BorderModeDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/RemapFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<uint8_t> tolerance_value(0); // Absolute tolerance of 0, passed to validate() by the Remap test cases below
+constexpr float                      tolerance_number = 0.f; // Last argument of validate() below; presumably the allowed ratio of mismatching elements - TODO confirm
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(Remap)
+
+template <typename T>
+using NERemapFixture = RemapValidationFixture<Tensor, Accessor, NERemap, T>; // Remap validation fixture bound to Tensor, Accessor and NERemap
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NERemapFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                                             framework::dataset::make("DataType",
+                                                                                                                     DataType::U8)),
+                                                                                                     framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
+{
+    // Small-shape case registered as PRECOMMIT (RunLarge stays NIGHTLY); compare the NERemap output against the reference under _valid_mask
+    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NERemapFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                                           framework::dataset::make("DataType",
+                                                                                                                   DataType::U8)),
+                                                                                                   framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
+{
+    // Large-shape case: compare the NERemap output against the reference under _valid_mask, using the zero tolerances declared above
+    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
+}
+TEST_SUITE_END() // Remap
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute

diff --git a/tests/validation/fixtures/AbsoluteDifferenceFixture.h b/tests/validation/fixtures/AbsoluteDifferenceFixture.h
deleted file mode 100644
index 46118c9..0000000
--- a/tests/validation/fixtures/AbsoluteDifferenceFixture.h
+++ /dev/null

@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_ABSOLUTE_DIFFERENCE_FIXTURE
-#define ARM_COMPUTE_TEST_ABSOLUTE_DIFFERENCE_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/AbsoluteDifference.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class AbsoluteDifferenceValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type0, DataType data_type1, DataType output_data_type)
-    {
-        _target    = compute_target(shape, data_type0, data_type1, output_data_type);
-        _reference = compute_reference(shape, data_type0, data_type1, output_data_type);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i)
-    {
-        library->fill_tensor_uniform(tensor, i);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type)
-    {
-        // Create tensors
-        TensorType ref_src1 = create_tensor<TensorType>(shape, data_type0, 1);
-        TensorType ref_src2 = create_tensor<TensorType>(shape, data_type1, 1);
-        TensorType dst      = create_tensor<TensorType>(shape, output_data_type, 1);
-
-        // Create and configure function
-        FunctionType abs_diff;
-        abs_diff.configure(&ref_src1, &ref_src2, &dst);
-
-        ARM_COMPUTE_EXPECT(ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        ref_src1.allocator()->allocate();
-        ref_src2.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(ref_src1), 0);
-        fill(AccessorType(ref_src2), 1);
-
-        // Compute function
-        abs_diff.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type)
-    {
-        // Create reference
-        SimpleTensor<T> ref_src1{ shape, data_type0, 1 };
-        SimpleTensor<T> ref_src2{ shape, data_type1, 1 };
-
-        // Fill reference
-        fill(ref_src1, 0);
-        fill(ref_src2, 1);
-
-        return reference::absolute_difference<T>(ref_src1, ref_src2, output_data_type);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_ABSOLUTE_DIFFERENCE_FIXTURE */

diff --git a/tests/validation/fixtures/AccumulateFixture.h b/tests/validation/fixtures/AccumulateFixture.h
deleted file mode 100644
index 7cea29c..0000000
--- a/tests/validation/fixtures/AccumulateFixture.h
+++ /dev/null

@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_ACCUMULATE_FIXTURE
-#define ARM_COMPUTE_TEST_ACCUMULATE_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/Accumulate.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
-class AccumulateBaseValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, DataType output_data_type)
-    {
-        _target    = compute_target(shape, data_type, output_data_type);
-        _reference = compute_reference(shape, data_type, output_data_type);
-    }
-
-protected:
-    template <typename U, typename D>
-    void fill(U &&tensor, int i, D max)
-    {
-        library->fill_tensor_uniform(tensor, i, static_cast<D>(0), max);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type, DataType output_data_type)
-    {
-        // Create tensors
-        TensorType ref_src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst     = create_tensor<TensorType>(shape, output_data_type);
-
-        // Create and configure function
-        FunctionType accum;
-        accum_conf(accum, ref_src, dst);
-
-        ARM_COMPUTE_EXPECT(ref_src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        ref_src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!ref_src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        const T1 max = std::numeric_limits<T1>::max();
-
-        // Fill tensors
-        fill(AccessorType(ref_src), 0, max);
-        fill(AccessorType(dst), 1, static_cast<T2>(max));
-
-        // Compute function
-        accum.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T2> compute_reference(const TensorShape &shape, DataType data_type, DataType output_data_type)
-    {
-        // Create reference
-        SimpleTensor<T1> ref_src{ shape, data_type };
-
-        const T1 max = std::numeric_limits<T1>::max();
-
-        // Fill reference
-        fill(ref_src, 0, max);
-
-        return accum_ref(ref_src, output_data_type);
-    }
-
-    virtual void accum_conf(FunctionType &func, const TensorType &input, TensorType &accum) = 0;
-
-    virtual SimpleTensor<T2> accum_ref(const SimpleTensor<T1> &input, DataType output_data_type) = 0;
-
-    TensorType       _target{};
-    SimpleTensor<T2> _reference{};
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
-class AccumulateValidationFixture : public AccumulateBaseValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, DataType output_data_type)
-    {
-        AccumulateBaseValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, data_type, output_data_type);
-    }
-
-    virtual void accum_conf(FunctionType &func, const TensorType &input, TensorType &accum) override
-    {
-        func.configure(&input, &accum);
-    }
-
-    virtual SimpleTensor<T2> accum_ref(const SimpleTensor<T1> &input, DataType output_data_type) override
-    {
-        return reference::accumulate<T1, T2>(input, output_data_type);
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
-class AccumulateWeightedValidationFixture : public AccumulateBaseValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, DataType output_data_type)
-    {
-        std::mt19937                          gen(library->seed());
-        std::uniform_real_distribution<float> float_dist(0, 1);
-
-        _alpha = float_dist(gen);
-
-        AccumulateBaseValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, data_type, output_data_type);
-    }
-
-    virtual void accum_conf(FunctionType &func, const TensorType &input, TensorType &accum) override
-    {
-        func.configure(&input, _alpha, &accum);
-    }
-
-    virtual SimpleTensor<T2> accum_ref(const SimpleTensor<T1> &input, DataType output_data_type) override
-    {
-        return reference::accumulate_weighted<T1, T2>(input, _alpha, output_data_type);
-    }
-
-    float _alpha{ 0.f };
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
-class AccumulateSquaredValidationFixture : public AccumulateBaseValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, DataType output_data_type)
-    {
-        std::mt19937                            gen(library->seed());
-        std::uniform_int_distribution<uint32_t> int_dist(0, 15);
-
-        _shift = int_dist(gen);
-
-        AccumulateBaseValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, data_type, output_data_type);
-    }
-
-    virtual void accum_conf(FunctionType &func, const TensorType &input, TensorType &accum) override
-    {
-        func.configure(&input, _shift, &accum);
-    }
-
-    virtual SimpleTensor<T2> accum_ref(const SimpleTensor<T1> &input, DataType output_data_type) override
-    {
-        return reference::accumulate_squared<T1, T2>(input, _shift, output_data_type);
-    }
-
-    uint32_t _shift{ 0U };
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_ACCUMULATE_FIXTURE */

diff --git a/tests/validation/fixtures/Box3x3Fixture.h b/tests/validation/fixtures/Box3x3Fixture.h
deleted file mode 100644
index 8caeec8..0000000
--- a/tests/validation/fixtures/Box3x3Fixture.h
+++ /dev/null

@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_BOX3X3_FIXTURE
-#define ARM_COMPUTE_TEST_BOX3X3_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Box3x3.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class Box3x3ValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, BorderMode border_mode)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution(0, 255);
-        const uint8_t                          constant_border_value = distribution(gen);
-
-        _border_mode = border_mode;
-        _target      = compute_target(shape, data_type, border_mode, constant_border_value);
-        _reference   = compute_reference(shape, data_type, border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType box3x3;
-        box3x3.configure(&src, &dst, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        box3x3.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        ARM_COMPUTE_ERROR_ON(data_type != DataType::U8);
-
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        // Compute reference
-        return reference::box3x3<T>(src, border_mode, constant_border_value);
-    }
-
-    BorderMode      _border_mode{};
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_BOX3X3_FIXTURE */

diff --git a/tests/validation/fixtures/CannyEdgeFixture.h b/tests/validation/fixtures/CannyEdgeFixture.h
deleted file mode 100644
index 8e82e6d..0000000
--- a/tests/validation/fixtures/CannyEdgeFixture.h
+++ /dev/null

@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_CANNY_EDGE_FIXTURE
-#define ARM_COMPUTE_TEST_CANNY_EDGE_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/CannyEdgeDetector.h"
-
-namespace arm_compute
-{
-class CLCannyEdge;
-class NECannyEdge;
-
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename ArrayType, typename FunctionType, typename T>
-class CannyEdgeValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(std::string image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, Format format)
-    {
-        CannyEdgeParameters params = canny_edge_parameters();
-
-        _target    = compute_target(image, gradient_size, norm_type, border_mode, format, params);
-        _reference = compute_reference(image, gradient_size, norm_type, border_mode, format, params);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, RawTensor raw)
-    {
-        library->fill(tensor, raw);
-    }
-
-    TensorType compute_target(const std::string &image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, Format format, const CannyEdgeParameters &params)
-    {
-        // Load the image (cached by the library if loaded before)
-        const RawTensor &raw = library->get(image, format);
-
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(raw.shape(), format);
-        TensorType dst = create_tensor<TensorType>(raw.shape(), format);
-        src.info()->set_format(format);
-        dst.info()->set_format(format);
-
-        // Create Canny edge configure function
-        FunctionType canny_edge;
-        canny_edge.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, static_cast<int>(norm_type) + 1, border_mode, params.constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src), raw);
-
-        // Compute function
-        canny_edge.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const std::string &image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, Format format, const CannyEdgeParameters &params)
-    {
-        ARM_COMPUTE_ERROR_ON(format != Format::U8);
-
-        // Load the image (cached by the library if loaded before)
-        const RawTensor &raw = library->get(image, format);
-
-        // Create reference
-        SimpleTensor<T> src{ raw.shape(), format };
-
-        // Fill reference
-        fill(src, raw);
-
-        return reference::canny_edge_detector<T>(src, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_CANNY_EDGE_FIXTURE */

diff --git a/tests/validation/fixtures/ChannelCombineFixture.h b/tests/validation/fixtures/ChannelCombineFixture.h
deleted file mode 100644
index f0d927a..0000000
--- a/tests/validation/fixtures/ChannelCombineFixture.h
+++ /dev/null

@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_CHANNEL_COMBINE_FIXTURE
-#define ARM_COMPUTE_TEST_CHANNEL_COMBINE_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/ChannelCombine.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-template <typename TensorType>
-inline std::vector<TensorType> create_tensor_planes(const TensorShape &shape, Format format)
-{
-    TensorShape image_shape = adjust_odd_shape(shape, format);
-    TensorInfo  info(image_shape, Format::U8);
-
-    std::vector<TensorType> tensor_planes;
-
-    switch(format)
-    {
-        case Format::RGB888:
-        case Format::RGBA8888:
-        case Format::YUV444:
-        {
-            tensor_planes.resize(3);
-
-            if(format == Format::RGBA8888)
-            {
-                tensor_planes.resize(4);
-            }
-
-            for(unsigned int plane_idx = 0; plane_idx < tensor_planes.size(); ++plane_idx)
-            {
-                tensor_planes[plane_idx].allocator()->init(info);
-            }
-
-            break;
-        }
-        case Format::YUYV422:
-        case Format::UYVY422:
-        {
-            const TensorShape uv_shape = calculate_subsampled_shape(image_shape, format);
-            const TensorInfo  info_hor2(uv_shape, Format::U8);
-
-            tensor_planes.resize(3);
-
-            tensor_planes[0].allocator()->init(info);
-            tensor_planes[1].allocator()->init(info_hor2);
-            tensor_planes[2].allocator()->init(info_hor2);
-            break;
-        }
-        case Format::NV12:
-        case Format::NV21:
-        case Format::IYUV:
-        {
-            const TensorShape sub2_shape = calculate_subsampled_shape(image_shape, format);
-            const TensorInfo  info_sub2(sub2_shape, Format::U8);
-
-            tensor_planes.resize(3);
-
-            tensor_planes[0].allocator()->init(info);
-            tensor_planes[1].allocator()->init(info_sub2);
-            tensor_planes[2].allocator()->init(info_sub2);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-
-    return tensor_planes;
-}
-} // namespace
-
-template <typename MultiImageType, typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ChannelCombineValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, Format format)
-    {
-        _num_planes = num_planes_from_format(format);
-        _target     = compute_target(shape, format);
-        _reference  = compute_reference(shape, format);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i)
-    {
-        library->fill_tensor_uniform(tensor, i);
-    }
-
-    template <typename U>
-    std::vector<SimpleTensor<U>> create_tensor_planes_reference(const TensorShape &shape, Format format)
-    {
-        std::vector<SimpleTensor<U>> tensor_planes;
-
-        TensorShape image_shape = adjust_odd_shape(shape, format);
-
-        switch(format)
-        {
-            case Format::RGB888:
-            case Format::RGBA8888:
-            case Format::YUV444:
-            {
-                if(format == Format::RGBA8888)
-                {
-                    tensor_planes.emplace_back(image_shape, Format::U8);
-                }
-
-                tensor_planes.emplace_back(image_shape, Format::U8);
-                tensor_planes.emplace_back(image_shape, Format::U8);
-                tensor_planes.emplace_back(image_shape, Format::U8);
-                break;
-            }
-            case Format::YUYV422:
-            case Format::UYVY422:
-            {
-                const TensorShape hor2_shape = calculate_subsampled_shape(image_shape, format);
-
-                tensor_planes.emplace_back(image_shape, Format::U8);
-                tensor_planes.emplace_back(hor2_shape, Format::U8);
-                tensor_planes.emplace_back(hor2_shape, Format::U8);
-                break;
-            }
-            case Format::NV12:
-            case Format::NV21:
-            case Format::IYUV:
-            {
-                const TensorShape shape_sub2 = calculate_subsampled_shape(image_shape, format);
-
-                tensor_planes.emplace_back(image_shape, Format::U8);
-                tensor_planes.emplace_back(shape_sub2, Format::U8);
-                tensor_planes.emplace_back(shape_sub2, Format::U8);
-                break;
-            }
-            default:
-                ARM_COMPUTE_ERROR("Not supported");
-                break;
-        }
-
-        return tensor_planes;
-    }
-
-    MultiImageType compute_target(const TensorShape &shape, Format format)
-    {
-        // Create tensors
-        std::vector<TensorType> ref_src = create_tensor_planes<TensorType>(shape, format);
-        MultiImageType          dst     = create_multi_image<MultiImageType>(shape, format);
-
-        // Create and configure function
-        FunctionType channel_combine;
-
-        if(1 == _num_planes)
-        {
-            const TensorType *tensor_extra = ((Format::RGBA8888 == format) ? &ref_src[3] : nullptr);
-            TensorType       *tensor_dst   = dynamic_cast<TensorType *>(dst.plane(0));
-
-            channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], tensor_extra, tensor_dst);
-        }
-        else
-        {
-            channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], &dst);
-        }
-
-        for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-        {
-            const TensorType *dst_plane = static_cast<const TensorType *>(dst.plane(plane_idx));
-
-            ARM_COMPUTE_EXPECT(dst_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        for(unsigned int plane_idx = 0; plane_idx < ref_src.size(); ++plane_idx)
-        {
-            ARM_COMPUTE_EXPECT(ref_src[plane_idx].info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        // Allocate tensors
-        dst.allocate();
-
-        for(unsigned int plane_idx = 0; plane_idx < ref_src.size(); ++plane_idx)
-        {
-            ref_src[plane_idx].allocator()->allocate();
-        }
-
-        for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-        {
-            const TensorType *dst_plane = static_cast<const TensorType *>(dst.plane(plane_idx));
-
-            ARM_COMPUTE_EXPECT(!dst_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        for(unsigned int plane_idx = 0; plane_idx < ref_src.size(); ++plane_idx)
-        {
-            ARM_COMPUTE_EXPECT(!ref_src[plane_idx].info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        // Fill tensor planes
-        for(unsigned int plane_idx = 0; plane_idx < ref_src.size(); ++plane_idx)
-        {
-            fill(AccessorType(ref_src[plane_idx]), plane_idx);
-        }
-
-        // Compute function
-        channel_combine.run();
-
-        return dst;
-    }
-
-    std::vector<SimpleTensor<T>> compute_reference(const TensorShape &shape, Format format)
-    {
-        // Create reference
-        std::vector<SimpleTensor<T>> ref_src = create_tensor_planes_reference<T>(shape, format);
-
-        // Fill references
-        for(unsigned int plane_idx = 0; plane_idx < ref_src.size(); ++plane_idx)
-        {
-            fill(ref_src[plane_idx], plane_idx);
-        }
-
-        return reference::channel_combine<T>(shape, ref_src, format);
-    }
-
-    unsigned int                 _num_planes{};
-    MultiImageType               _target{};
-    std::vector<SimpleTensor<T>> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_CHANNEL_COMBINE_FIXTURE */

diff --git a/tests/validation/fixtures/ChannelExtractFixture.h b/tests/validation/fixtures/ChannelExtractFixture.h
deleted file mode 100644
index 2f5694f..0000000
--- a/tests/validation/fixtures/ChannelExtractFixture.h
+++ /dev/null

@@ -1,192 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_CHANNEL_EXTRACT_FIXTURE
-#define ARM_COMPUTE_TEST_CHANNEL_EXTRACT_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/ChannelExtract.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename MultiImageType, typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ChannelExtractValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, Format format, Channel channel)
-    {
-        shape = adjust_odd_shape(shape, format);
-
-        _target    = compute_target(shape, format, channel);
-        _reference = compute_reference(shape, format, channel);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i)
-    {
-        library->fill_tensor_uniform(tensor, i);
-    }
-
-    std::vector<SimpleTensor<T>> create_tensor_planes_reference(const TensorShape &shape, Format format)
-    {
-        TensorShape input = adjust_odd_shape(shape, format);
-
-        std::vector<SimpleTensor<T>> tensor_planes;
-
-        switch(format)
-        {
-            case Format::RGB888:
-            case Format::RGBA8888:
-            case Format::YUYV422:
-            case Format::UYVY422:
-            {
-                tensor_planes.emplace_back(input, format);
-                break;
-            }
-            case Format::NV12:
-            case Format::NV21:
-            {
-                const TensorShape shape_uv88 = calculate_subsampled_shape(shape, Format::UV88);
-
-                tensor_planes.emplace_back(input, Format::U8);
-                tensor_planes.emplace_back(shape_uv88, Format::UV88);
-                break;
-            }
-            case Format::IYUV:
-            {
-                const TensorShape shape_sub2 = calculate_subsampled_shape(shape, Format::IYUV);
-
-                tensor_planes.emplace_back(input, Format::U8);
-                tensor_planes.emplace_back(shape_sub2, Format::U8);
-                tensor_planes.emplace_back(shape_sub2, Format::U8);
-                break;
-            }
-            case Format::YUV444:
-                tensor_planes.emplace_back(input, Format::U8);
-                tensor_planes.emplace_back(input, Format::U8);
-                tensor_planes.emplace_back(input, Format::U8);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Not supported");
-                break;
-        }
-
-        return tensor_planes;
-    }
-
-    TensorType compute_target(const TensorShape &shape, Format format, Channel channel)
-    {
-        const unsigned int num_planes = num_planes_from_format(format);
-
-        TensorShape dst_shape = calculate_subsampled_shape(shape, format, channel);
-
-        // Create tensors
-        MultiImageType ref_src = create_multi_image<MultiImageType>(shape, format);
-        TensorType     dst     = create_tensor<TensorType>(dst_shape, Format::U8);
-
-        // Create and configure function
-        FunctionType channel_extract;
-
-        if(1U == num_planes)
-        {
-            const TensorType *plane_src = static_cast<TensorType *>(ref_src.plane(0));
-
-            channel_extract.configure(plane_src, channel, &dst);
-        }
-        else
-        {
-            channel_extract.configure(&ref_src, channel, &dst);
-        }
-
-        for(unsigned int plane_idx = 0; plane_idx < num_planes; ++plane_idx)
-        {
-            const TensorType *src_plane = static_cast<const TensorType *>(ref_src.plane(plane_idx));
-
-            ARM_COMPUTE_EXPECT(src_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        ref_src.allocate();
-        dst.allocator()->allocate();
-
-        for(unsigned int plane_idx = 0; plane_idx < num_planes; ++plane_idx)
-        {
-            const TensorType *src_plane = static_cast<const TensorType *>(ref_src.plane(plane_idx));
-
-            ARM_COMPUTE_EXPECT(!src_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensor planes
-        for(unsigned int plane_idx = 0; plane_idx < num_planes; ++plane_idx)
-        {
-            TensorType *src_plane = static_cast<TensorType *>(ref_src.plane(plane_idx));
-
-            fill(AccessorType(*src_plane), plane_idx);
-        }
-
-        // Compute function
-        channel_extract.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, Format format, Channel channel)
-    {
-        const unsigned int num_planes = num_planes_from_format(format);
-
-        // Create reference
-        std::vector<SimpleTensor<T>> ref_src = create_tensor_planes_reference(shape, format);
-
-        // Fill references
-        for(unsigned int plane_idx = 0; plane_idx < num_planes; ++plane_idx)
-        {
-            fill(ref_src[plane_idx], plane_idx);
-        }
-
-        return reference::channel_extract<T>(shape, ref_src, format, channel);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_CHANNEL_EXTRACT_FIXTURE */

diff --git a/tests/validation/fixtures/ColorConvertFixture.h b/tests/validation/fixtures/ColorConvertFixture.h
deleted file mode 100644
index a5ed554..0000000
--- a/tests/validation/fixtures/ColorConvertFixture.h
+++ /dev/null

@@ -1,218 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_COLOR_CONVERT_FIXTURE
-#define ARM_COMPUTE_TEST_COLOR_CONVERT_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/ColorConvert.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-}
-template <typename MultiImageType, typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ColorConvertValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, Format src_format, Format dst_format)
-    {
-        shape = adjust_odd_shape(shape, src_format);
-        shape = adjust_odd_shape(shape, dst_format);
-
-        _target    = compute_target(shape, src_format, dst_format);
-        _reference = compute_reference(shape, src_format, dst_format);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i)
-    {
-        library->fill_tensor_uniform(tensor, i);
-    }
-
-    std::vector<SimpleTensor<T>> create_tensor_planes_reference(const TensorShape &shape, Format format)
-    {
-        std::vector<SimpleTensor<T>> tensor_planes;
-
-        switch(format)
-        {
-            case Format::RGB888:
-            case Format::RGBA8888:
-            case Format::YUYV422:
-            case Format::UYVY422:
-            {
-                tensor_planes.emplace_back(shape, format);
-                break;
-            }
-            case Format::NV12:
-            case Format::NV21:
-            {
-                const TensorShape shape_uv88 = calculate_subsampled_shape(shape, Format::UV88);
-
-                tensor_planes.emplace_back(shape, Format::U8);
-                tensor_planes.emplace_back(shape_uv88, Format::UV88);
-                break;
-            }
-            case Format::IYUV:
-            {
-                const TensorShape shape_sub2 = calculate_subsampled_shape(shape, Format::IYUV);
-
-                tensor_planes.emplace_back(shape, Format::U8);
-                tensor_planes.emplace_back(shape_sub2, Format::U8);
-                tensor_planes.emplace_back(shape_sub2, Format::U8);
-                break;
-            }
-            case Format::YUV444:
-            {
-                tensor_planes.emplace_back(shape, Format::U8);
-                tensor_planes.emplace_back(shape, Format::U8);
-                tensor_planes.emplace_back(shape, Format::U8);
-                break;
-            }
-            default:
-                ARM_COMPUTE_ERROR("Not supported");
-                break;
-        }
-
-        return tensor_planes;
-    }
-
-    MultiImageType compute_target(const TensorShape &shape, Format src_format, Format dst_format)
-    {
-        _src_num_planes = num_planes_from_format(src_format);
-        _dst_num_planes = num_planes_from_format(dst_format);
-
-        // Create tensors
-        MultiImageType ref_src = create_multi_image<MultiImageType>(shape, src_format);
-        MultiImageType ref_dst = create_multi_image<MultiImageType>(shape, dst_format);
-
-        // Create and configure function
-        FunctionType color_convert;
-
-        if(1U == _src_num_planes)
-        {
-            const TensorType *plane_src = static_cast<TensorType *>(ref_src.plane(0));
-
-            if(1U == _dst_num_planes)
-            {
-                TensorType *plane_dst = static_cast<TensorType *>(ref_dst.plane(0));
-                color_convert.configure(plane_src, plane_dst);
-            }
-            else
-            {
-                color_convert.configure(plane_src, &ref_dst);
-            }
-        }
-        else
-        {
-            if(1U == _dst_num_planes)
-            {
-                TensorType *plane_dst = static_cast<TensorType *>(ref_dst.plane(0));
-                color_convert.configure(&ref_src, plane_dst);
-            }
-            else
-            {
-                color_convert.configure(&ref_src, &ref_dst);
-            }
-        }
-
-        for(unsigned int plane_idx = 0; plane_idx < _src_num_planes; ++plane_idx)
-        {
-            const TensorType *src_plane = static_cast<const TensorType *>(ref_src.plane(plane_idx));
-
-            ARM_COMPUTE_EXPECT(src_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-        for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-        {
-            const TensorType *dst_plane = static_cast<const TensorType *>(ref_dst.plane(plane_idx));
-
-            ARM_COMPUTE_EXPECT(dst_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        // Allocate tensors
-        ref_src.allocate();
-        ref_dst.allocate();
-
-        for(unsigned int plane_idx = 0; plane_idx < _src_num_planes; ++plane_idx)
-        {
-            const TensorType *src_plane = static_cast<const TensorType *>(ref_src.plane(plane_idx));
-            ARM_COMPUTE_EXPECT(!src_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-        {
-            const TensorType *dst_plane = static_cast<const TensorType *>(ref_dst.plane(plane_idx));
-            ARM_COMPUTE_EXPECT(!dst_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        // Fill tensor planes
-        for(unsigned int plane_idx = 0; plane_idx < _src_num_planes; ++plane_idx)
-        {
-            TensorType *src_plane = static_cast<TensorType *>(ref_src.plane(plane_idx));
-
-            fill(AccessorType(*src_plane), plane_idx);
-        }
-
-        // Compute function
-        color_convert.run();
-
-        return ref_dst;
-    }
-
-    std::vector<SimpleTensor<T>> compute_reference(const TensorShape &shape, Format src_format, Format dst_format)
-    {
-        // Create reference
-        std::vector<SimpleTensor<T>> ref_src = create_tensor_planes_reference(shape, src_format);
-
-        // Fill references
-        for(unsigned int plane_idx = 0; plane_idx < ref_src.size(); ++plane_idx)
-        {
-            fill(ref_src[plane_idx], plane_idx);
-        }
-
-        return reference::color_convert<T>(shape, ref_src, src_format, dst_format);
-    }
-
-    unsigned int                 _src_num_planes{};
-    unsigned int                 _dst_num_planes{};
-    MultiImageType               _target{};
-    std::vector<SimpleTensor<T>> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_COLOR_CONVERT_FIXTURE */

diff --git a/tests/validation/fixtures/ConvolutionFixture.h b/tests/validation/fixtures/ConvolutionFixture.h
deleted file mode 100644
index 4692e2f..0000000
--- a/tests/validation/fixtures/ConvolutionFixture.h
+++ /dev/null

@@ -1,235 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_CONVOLUTION_FIXTURE
-#define ARM_COMPUTE_TEST_CONVOLUTION_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Convolution.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ConvolutionValidationFixture : public framework::Fixture
-{
-protected:
-    template <typename...>
-    void setup(TensorShape shape, DataType output_data_type, BorderMode border_mode, const unsigned int width, const unsigned int height, const bool is_separable = false)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution(0, 255);
-        std::uniform_int_distribution<uint8_t> scale_distribution(1, 255);
-        const uint8_t                          constant_border_value = distribution(gen);
-
-        // Generate random scale value between 1 and 255.
-        const uint32_t scale = scale_distribution(gen);
-
-        ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width);
-        ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height);
-
-        std::vector<int16_t> conv(width * height);
-
-        _width  = width;
-        _height = height;
-
-        if(is_separable)
-        {
-            init_separable_conv(conv.data(), width, height, library->seed());
-        }
-        else
-        {
-            init_conv(conv.data(), width, height, library->seed());
-        }
-
-        _target    = compute_target(shape, output_data_type, conv.data(), scale, border_mode, constant_border_value);
-        _reference = compute_reference(shape, output_data_type, conv.data(), scale, border_mode, constant_border_value);
-    }
-
-    template <typename U>
-    void fill(U &&tensor, int i)
-    {
-        library->fill_tensor_uniform(tensor, i);
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType output_data_type, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create reference
-        SimpleTensor<uint8_t> src{ shape, DataType::U8 };
-
-        // Fill reference
-        fill(src, 0);
-
-        // Compute reference
-        return reference::convolution<T>(src, output_data_type, conv, scale, border_mode, constant_border_value, _width, _height);
-    }
-
-    virtual TensorType compute_target(const TensorShape &shape, DataType output_data_type, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) = 0;
-
-    BorderMode      _border_mode{};
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-    unsigned int    _width{};
-    unsigned int    _height{};
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ConvolutionSquareValidationFixture : public ConvolutionValidationFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType output_data_type, BorderMode border_mode, const unsigned int width)
-    {
-        ConvolutionValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, output_data_type, border_mode, width, width);
-    }
-
-protected:
-    TensorType compute_target(const TensorShape &shape, DataType output_data_type, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, DataType::U8);
-        TensorType dst = create_tensor<TensorType>(shape, output_data_type);
-
-        // Create and configure function
-        FunctionType convolution;
-        convolution.configure(&src, &dst, conv, scale, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        this->fill(AccessorType(src), 0);
-        this->fill(AccessorType(dst), 1);
-
-        // Compute function
-        convolution.run();
-
-        return dst;
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ConvolutionSeparableValidationFixture : public ConvolutionValidationFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType output_data_type, BorderMode border_mode, const unsigned int width)
-    {
-        ConvolutionValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, output_data_type, border_mode, width, width, true);
-    }
-
-protected:
-    TensorType compute_target(const TensorShape &shape, DataType output_data_type, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, DataType::U8);
-        TensorType dst = create_tensor<TensorType>(shape, output_data_type);
-
-        // Create and configure function
-        FunctionType convolution;
-        convolution.configure(&src, &dst, conv, scale, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        this->fill(AccessorType(src), 0);
-        this->fill(AccessorType(dst), 1);
-
-        // Compute function
-        convolution.run();
-
-        return dst;
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ConvolutionRectangleValidationFixture : public ConvolutionValidationFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType output_data_type, BorderMode border_mode, const unsigned int width, const unsigned int height)
-    {
-        ConvolutionValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, output_data_type, border_mode, width, height);
-    }
-
-protected:
-    TensorType compute_target(const TensorShape &shape, DataType output_data_type, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, DataType::U8);
-        TensorType dst = create_tensor<TensorType>(shape, output_data_type);
-
-        // Create and configure function
-        FunctionType convolution;
-        convolution.configure(&src, &dst, conv, this->_width, this->_height, scale, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        this->fill(AccessorType(src), 0);
-        this->fill(AccessorType(dst), 1);
-
-        // Compute function
-        convolution.run();
-
-        return dst;
-    }
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_CONVOLUTION_FIXTURE */

diff --git a/tests/validation/fixtures/DerivativeFixture.h b/tests/validation/fixtures/DerivativeFixture.h
deleted file mode 100644
index e520a9e..0000000
--- a/tests/validation/fixtures/DerivativeFixture.h
+++ /dev/null

@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_DERIVATIVE_FIXTURE
-#define ARM_COMPUTE_TEST_DERIVATIVE_FIXTURE
-
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/Types.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Derivative.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename U>
-class DerivativeValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, BorderMode border_mode, Format format, GradientDimension gradient_dimension)
-    {
-        // Generate a random constant value
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-        const uint8_t                          constant_border_value = int_dist(gen);
-
-        _border_mode = border_mode;
-        _target      = compute_target(shape, border_mode, format, constant_border_value, gradient_dimension);
-        _reference   = compute_reference(shape, border_mode, format, constant_border_value, gradient_dimension);
-    }
-
-protected:
-    template <typename V>
-    void fill(V &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    template <typename V>
-    void fill_zero(V &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0, static_cast<U>(0), static_cast<U>(0));
-    }
-
-    std::pair<TensorType, TensorType> compute_target(const TensorShape &shape, BorderMode border_mode, Format format, uint8_t constant_border_value, GradientDimension gradient_dimension)
-    {
-        // Create tensors
-        TensorType src   = create_tensor<TensorType>(shape, data_type_from_format(format));
-        TensorType dst_x = create_tensor<TensorType>(shape, data_type_from_format(Format::S16));
-        TensorType dst_y = create_tensor<TensorType>(shape, data_type_from_format(Format::S16));
-
-        src.info()->set_format(format);
-        dst_x.info()->set_format(Format::S16);
-        dst_y.info()->set_format(Format::S16);
-
-        FunctionType derivative;
-
-        switch(gradient_dimension)
-        {
-            case GradientDimension::GRAD_X:
-                derivative.configure(&src, &dst_x, nullptr, border_mode, constant_border_value);
-                break;
-            case GradientDimension::GRAD_Y:
-                derivative.configure(&src, nullptr, &dst_y, border_mode, constant_border_value);
-                break;
-            case GradientDimension::GRAD_XY:
-                derivative.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Gradient dimension not supported");
-        }
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst_x.allocator()->allocate();
-        dst_y.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-        fill_zero(AccessorType(dst_x));
-        fill_zero(AccessorType(dst_y));
-
-        // Compute function
-        derivative.run();
-
-        return std::make_pair(std::move(dst_x), std::move(dst_y));
-    }
-
-    std::pair<SimpleTensor<U>, SimpleTensor<U>> compute_reference(const TensorShape &shape, BorderMode border_mode, Format format, uint8_t constant_border_value, GradientDimension gradient_dimension)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, format };
-
-        // Fill reference
-        fill(src);
-
-        return reference::derivative<U>(src, border_mode, constant_border_value, gradient_dimension);
-    }
-
-    BorderMode _border_mode{ BorderMode::UNDEFINED };
-    std::pair<TensorType, TensorType>           _target{};
-    std::pair<SimpleTensor<U>, SimpleTensor<U>> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_DERIVATIVE_FIXTURE */

diff --git a/tests/validation/fixtures/DilateFixture.h b/tests/validation/fixtures/DilateFixture.h
deleted file mode 100644
index 51ed4df..0000000
--- a/tests/validation/fixtures/DilateFixture.h
+++ /dev/null

@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_DILATE_FIXTURE
-#define ARM_COMPUTE_TEST_DILATE_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Dilate.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class DilateValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, BorderMode border_mode)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution(0, 255);
-        const uint8_t                          constant_border_value = distribution(gen);
-
-        _border_mode = border_mode;
-        _target      = compute_target(shape, data_type, border_mode, constant_border_value);
-        _reference   = compute_reference(shape, data_type, border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType dilate;
-        dilate.configure(&src, &dst, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        dilate.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        ARM_COMPUTE_ERROR_ON(data_type != DataType::U8);
-
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        // Compute reference
-        return reference::dilate<T>(src, border_mode, constant_border_value);
-    }
-
-    BorderMode      _border_mode{};
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_DILATE_FIXTURE */

diff --git a/tests/validation/fixtures/EqualizeHistogramFixture.h b/tests/validation/fixtures/EqualizeHistogramFixture.h
deleted file mode 100644
index f7a0312..0000000
--- a/tests/validation/fixtures/EqualizeHistogramFixture.h
+++ /dev/null

@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_EQUALIZE_HISTOGRAM_FIXTURE
-#define ARM_COMPUTE_TEST_EQUALIZE_HISTOGRAM_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/EqualizeHistogram.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class EqualizeHistogramValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type)
-    {
-        _target    = compute_target(shape, data_type);
-        _reference = compute_reference(shape, data_type);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType equalize_histogram;
-
-        equalize_histogram.configure(&src, &dst);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        equalize_histogram.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        return reference::equalize_histogram<T>(src);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_EQUALIZE_HISTOGRAM_FIXTURE */

diff --git a/tests/validation/fixtures/ErodeFixture.h b/tests/validation/fixtures/ErodeFixture.h
deleted file mode 100644
index b9f17a2..0000000
--- a/tests/validation/fixtures/ErodeFixture.h
+++ /dev/null

@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_ERODE_FIXTURE
-#define ARM_COMPUTE_TEST_ERODE_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Erode.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ErodeValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, BorderMode border_mode)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution(0, 255);
-        const uint8_t                          constant_border_value = distribution(gen);
-
-        _border_mode = border_mode;
-        _target      = compute_target(shape, data_type, border_mode, constant_border_value);
-        _reference   = compute_reference(shape, data_type, border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType erode;
-        erode.configure(&src, &dst, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        erode.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        ARM_COMPUTE_ERROR_ON(data_type != DataType::U8);
-
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        // Compute reference
-        return reference::erode<T>(src, border_mode, constant_border_value);
-    }
-
-    BorderMode      _border_mode{};
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_ERODE_FIXTURE */

diff --git a/tests/validation/fixtures/FastCornersFixture.h b/tests/validation/fixtures/FastCornersFixture.h
deleted file mode 100644
index ae66c37..0000000
--- a/tests/validation/fixtures/FastCornersFixture.h
+++ /dev/null

@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_FAST_CORNERS_FIXTURE
-#define ARM_COMPUTE_TEST_FAST_CORNERS_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/FastCorners.h"
-
-#include <random>
-
-namespace arm_compute
-{
-class CLFastCorners;
-class NEFastCorners;
-
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename ArrayType, typename FunctionType, typename T>
-class FastCornersValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(std::string image, Format format, bool suppress_nonmax, BorderMode border_mode)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-        std::uniform_real_distribution<float>  real_dist(0, 255);
-
-        const uint8_t constant_border_value = int_dist(gen);
-        const float   threshold             = real_dist(gen);
-
-        _target    = compute_target(image, format, threshold, suppress_nonmax, border_mode, constant_border_value);
-        _reference = compute_reference(image, format, threshold, suppress_nonmax, border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, RawTensor raw)
-    {
-        library->fill(tensor, raw);
-    }
-
-    template <typename F, typename std::enable_if<std::is_same<F, CLFastCorners>::value, int>::type = 0>
-    void configure_target(F &func, TensorType &src, ArrayType &corners, unsigned int *num_corners, float threshold, bool suppress_nonmax, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        func.configure(&src, threshold, suppress_nonmax, &corners, num_corners, border_mode, constant_border_value);
-    }
-
-    template <typename F, typename std::enable_if<std::is_same<F, NEFastCorners>::value, int>::type = 0>
-    void configure_target(F &func, TensorType &src, ArrayType &corners, unsigned int *num_corners, float threshold, bool suppress_nonmax, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        ARM_COMPUTE_UNUSED(num_corners);
-        func.configure(&src, threshold, suppress_nonmax, &corners, border_mode, constant_border_value);
-    }
-
-    ArrayType compute_target(const std::string &image, Format format, float threshold, bool suppress_nonmax, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Load the image (cached by the library if loaded before)
-        const RawTensor &raw = library->get(image, format);
-
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(raw.shape(), format);
-
-        // Create array of keypoints
-        ArrayType    corners(raw.shape().total_size());
-        unsigned int num_corners = raw.shape().total_size();
-
-        // Create and configure function
-        FunctionType fast_corners;
-        configure_target<FunctionType>(fast_corners, src, corners, &num_corners, threshold, suppress_nonmax, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src), raw);
-
-        // Compute function
-        fast_corners.run();
-
-        return corners;
-    }
-
-    std::vector<KeyPoint> compute_reference(const std::string &image, Format format, float threshold, bool suppress_nonmax, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Load the image (cached by the library if loaded before)
-        const RawTensor &raw = library->get(image, format);
-
-        // Create reference
-        SimpleTensor<T> src{ raw.shape(), format };
-
-        // Fill reference
-        fill(src, raw);
-
-        // Compute reference
-        return reference::fast_corners<T>(src, threshold, suppress_nonmax, border_mode, constant_border_value);
-    }
-
-    ArrayType             _target{};
-    std::vector<KeyPoint> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_FAST_CORNERS_FIXTURE */

diff --git a/tests/validation/fixtures/Gaussian3x3Fixture.h b/tests/validation/fixtures/Gaussian3x3Fixture.h
deleted file mode 100644
index 4a154ea..0000000
--- a/tests/validation/fixtures/Gaussian3x3Fixture.h
+++ /dev/null

@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_GAUSSIAN3X3_FIXTURE
-#define ARM_COMPUTE_TEST_GAUSSIAN3X3_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Gaussian3x3.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class Gaussian3x3ValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, BorderMode border_mode)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution(0, 255);
-        const uint8_t                          constant_border_value = distribution(gen);
-
-        _border_mode = border_mode;
-        _target      = compute_target(shape, data_type, border_mode, constant_border_value);
-        _reference   = compute_reference(shape, data_type, border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType gaussian3x3;
-        gaussian3x3.configure(&src, &dst, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        gaussian3x3.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        ARM_COMPUTE_ERROR_ON(data_type != DataType::U8);
-
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        // Compute reference
-        return reference::gaussian3x3<T>(src, border_mode, constant_border_value);
-    }
-
-    BorderMode      _border_mode{};
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GAUSSIAN3X3_FIXTURE */

diff --git a/tests/validation/fixtures/Gaussian5x5Fixture.h b/tests/validation/fixtures/Gaussian5x5Fixture.h
deleted file mode 100644
index 68f91e1..0000000
--- a/tests/validation/fixtures/Gaussian5x5Fixture.h
+++ /dev/null

@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_GAUSSIAN5X5_FIXTURE
-#define ARM_COMPUTE_TEST_GAUSSIAN5X5_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Gaussian5x5.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class Gaussian5x5ValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, BorderMode border_mode)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution(0, 255);
-        const uint8_t                          constant_border_value = distribution(gen);
-
-        _border_mode = border_mode;
-        _target      = compute_target(shape, data_type, border_mode, constant_border_value);
-        _reference   = compute_reference(shape, data_type, border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType gaussian5x5;
-        gaussian5x5.configure(&src, &dst, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        gaussian5x5.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        ARM_COMPUTE_ERROR_ON(data_type != DataType::U8);
-
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        // Compute reference
-        return reference::gaussian5x5<T>(src, border_mode, constant_border_value);
-    }
-
-    BorderMode      _border_mode{};
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GAUSSIAN5X5_FIXTURE */

diff --git a/tests/validation/fixtures/GaussianPyramidHalfFixture.h b/tests/validation/fixtures/GaussianPyramidHalfFixture.h
deleted file mode 100644
index f91b1d5..0000000
--- a/tests/validation/fixtures/GaussianPyramidHalfFixture.h
+++ /dev/null

@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_GAUSSIAN_PYRAMID_HALF_FIXTURE
-#define ARM_COMPUTE_TEST_GAUSSIAN_PYRAMID_HALF_FIXTURE
-
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/PyramidInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/GaussianPyramidHalf.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename PyramidType>
-class GaussianPyramidHalfValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, BorderMode border_mode, size_t num_levels)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution(0, 255);
-        const uint8_t                          constant_border_value = distribution(gen);
-
-        _border_mode = border_mode;
-
-        // Compute target and reference
-        compute_target(shape, border_mode, constant_border_value, num_levels);
-        compute_reference(shape, border_mode, constant_border_value, num_levels);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    void compute_target(const TensorShape &shape, BorderMode border_mode, uint8_t constant_border_value, size_t num_levels)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, DataType::U8);
-
-        PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, shape, Format::U8);
-        _target.init(pyramid_info);
-
-        // Create and configure function
-        FunctionType gaussian_pyramid;
-
-        gaussian_pyramid.configure(&src, &_target, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        for(size_t i = 0; i < pyramid_info.num_levels(); ++i)
-        {
-            ARM_COMPUTE_EXPECT(_target.get_pyramid_level(i)->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        // Allocate input tensor
-        src.allocator()->allocate();
-
-        // Allocate pyramid
-        _target.allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        for(size_t i = 0; i < pyramid_info.num_levels(); ++i)
-        {
-            ARM_COMPUTE_EXPECT(!_target.get_pyramid_level(i)->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        gaussian_pyramid.run();
-    }
-
-    void compute_reference(const TensorShape &shape, BorderMode border_mode, uint8_t constant_border_value, size_t num_levels)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, DataType::U8 };
-
-        // Fill reference
-        fill(src);
-
-        _reference = reference::gaussian_pyramid_half<T>(src, border_mode, constant_border_value, num_levels);
-    }
-
-    PyramidType                  _target{};
-    std::vector<SimpleTensor<T>> _reference{};
-    BorderMode                   _border_mode{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GAUSSIAN_PYRAMID_HALF_FIXTURE */
\ No newline at end of file

diff --git a/tests/validation/fixtures/HOGDescriptorFixture.h b/tests/validation/fixtures/HOGDescriptorFixture.h
deleted file mode 100644
index 1021e12..0000000
--- a/tests/validation/fixtures/HOGDescriptorFixture.h
+++ /dev/null

@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_HOG_DESCRIPTOR_FIXTURE
-#define ARM_COMPUTE_TEST_HOG_DESCRIPTOR_FIXTURE
-
-#include "arm_compute/core/HOGInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/HOGDescriptor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename HOGType, typename AccessorType, typename FunctionType, typename T, typename U>
-class HOGDescriptorValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(std::string image, HOGInfo hog_info, Format format, BorderMode border_mode)
-    {
-        // Only defined borders supported
-        ARM_COMPUTE_ERROR_ON(border_mode == BorderMode::UNDEFINED);
-
-        // Generate a random constant value
-        std::mt19937                     gen(library->seed());
-        std::uniform_int_distribution<T> int_dist(0, 255);
-        const T                          constant_border_value = int_dist(gen);
-
-        _target    = compute_target(image, format, border_mode, constant_border_value, hog_info);
-        _reference = compute_reference(image, format, border_mode, constant_border_value, hog_info);
-    }
-
-protected:
-    template <typename V>
-    void fill(V &&tensor, const std::string image, Format format)
-    {
-        library->fill(tensor, image, format);
-    }
-
-    template <typename V, typename D>
-    void fill(V &&tensor, int i, D max)
-    {
-        library->fill_tensor_uniform(tensor, i, static_cast<D>(0), max);
-    }
-
-    TensorType compute_target(const std::string image, Format &format, BorderMode &border_mode, T constant_border_value, const HOGInfo &hog_info)
-    {
-        // Get image shape for src tensor
-        TensorShape shape = library->get_image_shape(image);
-
-        // Create tensor info for HOG descriptor
-        TensorInfo tensor_info_hog_descriptor(hog_info, shape.x(), shape.y());
-
-        // Create HOG
-        HOGType hog = create_HOG<HOGType>(hog_info);
-
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type_from_format(format));
-        TensorType dst = create_tensor<TensorType>(tensor_info_hog_descriptor.tensor_shape(), DataType::F32, tensor_info_hog_descriptor.num_channels());
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Create and configure function
-        FunctionType hog_descriptor;
-        hog_descriptor.configure(&src, &dst, &hog, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        const T max = std::numeric_limits<T>::max();
-
-        // Fill tensors
-        fill(AccessorType(src), image, format);
-        fill(AccessorType(dst), 1, static_cast<U>(max));
-
-        // Compute function
-        hog_descriptor.run();
-
-        return dst;
-    }
-
-    SimpleTensor<U> compute_reference(const std::string image, Format format, BorderMode border_mode, T constant_border_value, const HOGInfo &hog_info)
-    {
-        // Create reference
-        SimpleTensor<T> src{ library->get_image_shape(image), data_type_from_format(format) };
-
-        // Fill reference
-        fill(src, image, format);
-
-        return reference::hog_descriptor<U>(src, border_mode, constant_border_value, hog_info);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<U> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_HOG_DESCRIPTOR_FIXTURE */

diff --git a/tests/validation/fixtures/HOGDetectorFixture.h b/tests/validation/fixtures/HOGDetectorFixture.h
deleted file mode 100644
index f12e65b..0000000
--- a/tests/validation/fixtures/HOGDetectorFixture.h
+++ /dev/null

@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_HOG_DETECTOR_FIXTURE
-#define ARM_COMPUTE_TEST_HOG_DETECTOR_FIXTURE
-
-#include "arm_compute/core/HOGInfo.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/IHOGAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/fixtures/HOGDescriptorFixture.h"
-#include "tests/validation/reference/HOGDetector.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType,
-          typename HOGType,
-          typename DetectionWindowArrayType,
-          typename HOGDescriptorType,
-          typename AccessorType,
-          typename ArrayAccessorType,
-          typename HOGAccessorType,
-          typename HOGDetectorType,
-          typename T,
-          typename U>
-class HOGDetectorValidationFixture : public HOGDescriptorValidationFixture<TensorType, HOGType, AccessorType, HOGDescriptorType, T, U>
-{
-public:
-    template <typename...>
-    void setup(Size2D detection_window_stride, std::string image, HOGInfo hog_info, Format format, BorderMode border_mode)
-    {
-        using HDF = HOGDescriptorValidationFixture<TensorType, HOGType, AccessorType, HOGDescriptorType, T, U>;
-        HDF::setup(image, hog_info, format, border_mode);
-
-        const unsigned int max_num_detection_windows = 100000;
-
-        // Initialise descriptor (linear SVM coefficients).
-        // NOTE: Fixed values are used to keep the number of detection windows detected
-        // consistent in order to have meaningful validation tolerances.
-        // The values are "unbalanced" to reduce the number of detected objects
-        std::random_device::result_type seed       = 0;
-        std::vector<U>                  descriptor = generate_random_real(hog_info.descriptor_size(), -0.505f, 0.495f, seed);
-
-        // Compute target and reference values using feature vector from descriptor kernel
-        _target    = compute_target(HDF::_target, descriptor, max_num_detection_windows, hog_info, detection_window_stride);
-        _reference = compute_reference(HDF::_reference, descriptor, max_num_detection_windows, hog_info, detection_window_stride);
-    }
-
-protected:
-    std::vector<DetectionWindow> compute_target(const TensorType &src, const std::vector<U> &descriptor, unsigned int max_num_detection_windows,
-                                                const HOGInfo &hog_info, const Size2D &detection_window_stride)
-    {
-        // Create HOG
-        HOGType hog = create_HOG<HOGType>(hog_info);
-
-        // Create array of detection windows
-        DetectionWindowArrayType detection_windows(max_num_detection_windows);
-
-        // Copy HOG descriptor values to HOG memory
-        {
-            HOGAccessorType hog_accessor(hog);
-            std::memcpy(hog_accessor.descriptor(), descriptor.data(), descriptor.size() * sizeof(U));
-        }
-
-        // Create and configure function
-        HOGDetectorType hog_detector;
-        hog_detector.configure(&src, &hog, &detection_windows, detection_window_stride);
-
-        // Reset detection windows
-        detection_windows.clear();
-
-        // Compute function
-        hog_detector.run();
-
-        // Create array of detection windows
-        std::vector<DetectionWindow> windows;
-
-        // Copy detection windows
-        ArrayAccessorType accessor(detection_windows);
-
-        for(size_t i = 0; i < accessor.num_values(); i++)
-        {
-            DetectionWindow win;
-            win.x         = accessor.at(i).x;
-            win.y         = accessor.at(i).y;
-            win.width     = accessor.at(i).width;
-            win.height    = accessor.at(i).height;
-            win.idx_class = accessor.at(i).idx_class;
-            win.score     = accessor.at(i).score;
-
-            windows.push_back(win);
-        }
-
-        return windows;
-    }
-
-    std::vector<DetectionWindow> compute_reference(const SimpleTensor<U> &src, const std::vector<U> &descriptor, unsigned int max_num_detection_windows,
-                                                   const HOGInfo &hog_info, const Size2D &detection_window_stride)
-    {
-        // Assumes defaults value of zero for threshold and class_idx.
-        return reference::hog_detector(src, descriptor, max_num_detection_windows, hog_info, detection_window_stride);
-    }
-
-    std::vector<DetectionWindow> _target{};
-    std::vector<DetectionWindow> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_HOG_DETECTOR_FIXTURE */

diff --git a/tests/validation/fixtures/HOGMultiDetectionFixture.h b/tests/validation/fixtures/HOGMultiDetectionFixture.h
deleted file mode 100644
index c37bdb6..0000000
--- a/tests/validation/fixtures/HOGMultiDetectionFixture.h
+++ /dev/null

@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_FIXTURE
-#define ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_FIXTURE
-
-#include "arm_compute/core/HOGInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/IHOGAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/HOGMultiDetection.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType,
-          typename HOGType,
-          typename MultiHOGType,
-          typename DetectionWindowArrayType,
-          typename DetectionWindowStrideType,
-          typename AccessorType,
-          typename Size2DArrayAccessorType,
-          typename DetectionWindowArrayAccessorType,
-          typename HOGAccessorType,
-          typename FunctionType,
-          typename T,
-          typename U>
-class HOGMultiDetectionValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(std::string image, std::vector<HOGInfo> models, Format format, BorderMode border_mode, bool non_maxima_suppression)
-    {
-        // Only defined borders supported
-        ARM_COMPUTE_ERROR_ON(border_mode == BorderMode::UNDEFINED);
-
-        // Generate a random constant value
-        std::mt19937                     gen(library->seed());
-        std::uniform_int_distribution<T> int_dist(0, 255);
-        const T                          constant_border_value = int_dist(gen);
-
-        // Initialize descriptors vector
-        std::vector<std::vector<U>> descriptors(models.size());
-
-        // Use default values for threshold and min_distance
-        const float threshold    = 0.f;
-        const float min_distance = 1.f;
-
-        // Maximum number of detection windows per batch
-        const unsigned int max_num_detection_windows = 100000;
-
-        _target    = compute_target(image, format, border_mode, constant_border_value, models, descriptors, max_num_detection_windows, threshold, non_maxima_suppression, min_distance);
-        _reference = compute_reference(image, format, border_mode, constant_border_value, models, descriptors, max_num_detection_windows, threshold, non_maxima_suppression, min_distance);
-    }
-
-protected:
-    template <typename V>
-    void fill(V &&tensor, const std::string image, Format format)
-    {
-        library->fill(tensor, image, format);
-    }
-
-    void initialize_batch(const std::vector<HOGInfo> &models, MultiHOGType &multi_hog,
-                          std::vector<std::vector<U>> &descriptors, DetectionWindowStrideType &detection_window_strides)
-    {
-        for(unsigned i = 0; i < models.size(); ++i)
-        {
-            auto hog_model = reinterpret_cast<HOGType *>(multi_hog.model(i));
-            hog_model->init(models[i]);
-
-            // Initialise descriptor (linear SVM coefficients).
-            std::random_device::result_type seed = 0;
-            descriptors.at(i)                    = generate_random_real(models[i].descriptor_size(), -0.505f, 0.495f, seed);
-
-            // Copy HOG descriptor values to HOG memory
-            {
-                HOGAccessorType hog_accessor(*hog_model);
-                std::memcpy(hog_accessor.descriptor(), descriptors.at(i).data(), descriptors.at(i).size() * sizeof(U));
-            }
-
-            // Initialize detection window stride
-            Size2DArrayAccessorType accessor(detection_window_strides);
-            accessor.at(i) = models[i].block_stride();
-        }
-    }
-
-    std::vector<DetectionWindow> compute_target(const std::string image, Format &format, BorderMode &border_mode, T constant_border_value,
-                                                const std::vector<HOGInfo> &models, std::vector<std::vector<U>> &descriptors, unsigned int max_num_detection_windows,
-                                                float threshold, bool non_max_suppression, float min_distance)
-    {
-        MultiHOGType              multi_hog(models.size());
-        DetectionWindowArrayType  detection_windows(max_num_detection_windows);
-        DetectionWindowStrideType detection_window_strides(models.size());
-
-        // Resize detection window_strides for index access
-        detection_window_strides.resize(models.size());
-
-        // Initialiize MultiHOG and detection windows
-        initialize_batch(models, multi_hog, descriptors, detection_window_strides);
-
-        // Get image shape for src tensor
-        TensorShape shape = library->get_image_shape(image);
-
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type_from_format(format));
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Create and configure function
-        FunctionType hog_multi_detection;
-        hog_multi_detection.configure(&src, &multi_hog, &detection_windows, &detection_window_strides, border_mode, constant_border_value, threshold, non_max_suppression, min_distance);
-
-        // Reset detection windows
-        detection_windows.clear();
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src), image, format);
-
-        // Compute function
-        hog_multi_detection.run();
-
-        // Copy detection windows
-        std::vector<DetectionWindow>     windows;
-        DetectionWindowArrayAccessorType accessor(detection_windows);
-
-        for(size_t i = 0; i < accessor.num_values(); i++)
-        {
-            DetectionWindow win;
-            win.x         = accessor.at(i).x;
-            win.y         = accessor.at(i).y;
-            win.width     = accessor.at(i).width;
-            win.height    = accessor.at(i).height;
-            win.idx_class = accessor.at(i).idx_class;
-            win.score     = accessor.at(i).score;
-
-            windows.push_back(win);
-        }
-
-        return windows;
-    }
-
-    std::vector<DetectionWindow> compute_reference(const std::string image, Format format, BorderMode border_mode, T constant_border_value,
-                                                   const std::vector<HOGInfo> &models, const std::vector<std::vector<U>> &descriptors, unsigned int max_num_detection_windows,
-                                                   float threshold, bool non_max_suppression, float min_distance)
-    {
-        // Create reference
-        SimpleTensor<T> src{ library->get_image_shape(image), data_type_from_format(format) };
-
-        // Fill reference
-        fill(src, image, format);
-
-        // NOTE: Detection window stride fixed to block stride
-        return reference::hog_multi_detection(src, border_mode, constant_border_value, models, descriptors, max_num_detection_windows, threshold, non_max_suppression, min_distance);
-    }
-
-    std::vector<DetectionWindow> _target{};
-    std::vector<DetectionWindow> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_FIXTURE */

diff --git a/tests/validation/fixtures/HarrisCornersFixture.h b/tests/validation/fixtures/HarrisCornersFixture.h
deleted file mode 100644
index dbe77dd..0000000
--- a/tests/validation/fixtures/HarrisCornersFixture.h
+++ /dev/null

@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_HARRIS_CORNERS_FIXTURE
-#define ARM_COMPUTE_TEST_HARRIS_CORNERS_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/HarrisCornerDetector.h"
-
-namespace arm_compute
-{
-class CLHarrisCorners;
-class NEHarrisCorners;
-
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename ArrayType, typename FunctionType, typename T>
-class HarrisCornersValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(std::string image, int gradient_size, int block_size, BorderMode border_mode, Format format)
-    {
-        HarrisCornersParameters params = harris_corners_parameters();
-
-        _target    = compute_target(image, gradient_size, block_size, border_mode, format, params);
-        _reference = compute_reference(image, gradient_size, block_size, border_mode, format, params);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, RawTensor raw)
-    {
-        library->fill(tensor, raw);
-    }
-
-    ArrayType compute_target(std::string image, int gradient_size, int block_size, BorderMode border_mode, Format format, const HarrisCornersParameters &params)
-    {
-        // Load the image (cached by the library if loaded before)
-        const RawTensor &raw = library->get(image, format);
-
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(raw.shape(), format);
-
-        // Create array of keypoints
-        ArrayType corners(raw.shape().total_size());
-
-        // Create harris corners configure function
-        FunctionType harris_corners;
-        harris_corners.configure(&src, params.threshold, params.min_dist, params.sensitivity, gradient_size, block_size, &corners, border_mode, params.constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src), raw);
-
-        // Compute function
-        harris_corners.run();
-
-        return corners;
-    }
-
-    std::vector<KeyPoint> compute_reference(std::string image, int gradient_size, int block_size, BorderMode border_mode, Format format, const HarrisCornersParameters &params)
-    {
-        // Load the image (cached by the library if loaded before)
-        const RawTensor &raw = library->get(image, format);
-        // Create reference
-        SimpleTensor<T> src{ raw.shape(), format };
-
-        // Fill reference
-        fill(src, raw);
-
-        return reference::harris_corner_detector<T>(src, params.threshold, params.min_dist, params.sensitivity, gradient_size, block_size, border_mode, params.constant_border_value);
-    }
-
-    ArrayType             _target{};
-    std::vector<KeyPoint> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_HARRIS_CORNERS_FIXTURE */

diff --git a/tests/validation/fixtures/HistogramFixture.h b/tests/validation/fixtures/HistogramFixture.h
deleted file mode 100644
index dceb23b..0000000
--- a/tests/validation/fixtures/HistogramFixture.h
+++ /dev/null

@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_HISTOGRAM_FIXTURE
-#define ARM_COMPUTE_TEST_HISTOGRAM_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Histogram.h"
-#include "utils/Utils.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename DistributionType>
-class HistogramValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type)
-    {
-        std::mt19937                            gen(library->seed());
-        std::uniform_int_distribution<size_t>   distribution_size_t(1, 30);
-        const size_t                            num_bins = distribution_size_t(gen);
-        std::uniform_int_distribution<int32_t>  distribution_int32_t(0, 125);
-        const size_t                            offset = distribution_int32_t(gen);
-        std::uniform_int_distribution<uint32_t> distribution_uint32_t(1, 255 - offset);
-        const size_t                            range = distribution_uint32_t(gen);
-
-        _target    = compute_target(shape, data_type, num_bins, offset, range);
-        _reference = compute_reference(shape, data_type, num_bins, offset, range);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type, size_t num_bins, int32_t offset, uint32_t range)
-    {
-        // Create tensors
-        TensorType       src = create_tensor<TensorType>(shape, data_type);
-        TensorType       dst = create_tensor<TensorType>(TensorShape(num_bins), DataType::U32);
-        DistributionType distribution_dst(num_bins, offset, range);
-
-        // Create and configure function
-        FunctionType histogram;
-        histogram.configure(&src, &distribution_dst);
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        histogram.run();
-
-        // Copy the distribution in a tensor
-        arm_compute::utils::map(distribution_dst, true);
-        AccessorType accessor_dst = AccessorType(dst);
-        uint32_t    *dst_data     = static_cast<uint32_t *>(accessor_dst.data());
-
-        ARM_COMPUTE_EXPECT(accessor_dst.size() <= dst.info()->total_size(), framework::LogLevel::ERRORS);
-
-        std::copy_n(distribution_dst.buffer(), num_bins, dst_data);
-        arm_compute::utils::unmap(distribution_dst);
-        return dst;
-    }
-
-    SimpleTensor<uint32_t> compute_reference(const TensorShape &shape, DataType data_type, size_t num_bins, int32_t offset, uint32_t range)
-    {
-        ARM_COMPUTE_ERROR_ON(data_type != DataType::U8);
-
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        // Compute reference
-        return reference::histogram<T>(src, num_bins, offset, range);
-    }
-
-    TensorType             _target{};
-    SimpleTensor<uint32_t> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_HISTOGRAM_FIXTURE */

diff --git a/tests/validation/fixtures/IntegralImageFixture.h b/tests/validation/fixtures/IntegralImageFixture.h
deleted file mode 100644
index abc9973..0000000
--- a/tests/validation/fixtures/IntegralImageFixture.h
+++ /dev/null

@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_INTEGRAL_IMAGE_FIXTURE
-#define ARM_COMPUTE_TEST_INTEGRAL_IMAGE_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/IntegralImage.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class IntegralImageValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type)
-    {
-        _target    = compute_target(shape);
-        _reference = compute_reference(shape, data_type);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, DataType::U8);
-        TensorType dst = create_tensor<TensorType>(shape, DataType::U32);
-
-        // Create and configure function
-        FunctionType integral_image;
-        integral_image.configure(&src, &dst);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        integral_image.run();
-
-        return dst;
-    }
-
-    SimpleTensor<uint32_t> compute_reference(const TensorShape &shape, DataType data_type)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        return reference::integral_image<T>(src);
-    }
-
-    TensorType             _target{};
-    SimpleTensor<uint32_t> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_INTEGRAL_IMAGE_FIXTURE */

diff --git a/tests/validation/fixtures/LaplacianPyramidFixture.h b/tests/validation/fixtures/LaplacianPyramidFixture.h
deleted file mode 100644
index 7131996..0000000
--- a/tests/validation/fixtures/LaplacianPyramidFixture.h
+++ /dev/null

@@ -1,154 +0,0 @@
-/*
- * Copyright (c) 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_LAPLACIAN_PYRAMID_FIXTURE
-#define ARM_COMPUTE_TEST_LAPLACIAN_PYRAMID_FIXTURE
-
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/PyramidInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/LaplacianPyramid.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename U, typename PyramidType>
-class LaplacianPyramidValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape input_shape, BorderMode border_mode, size_t num_levels, Format format_in, Format format_out)
-    {
-        std::mt19937                     generator(library->seed());
-        std::uniform_int_distribution<T> distribution_u8(0, 255);
-        const T                          constant_border_value = distribution_u8(generator);
-
-        _pyramid_levels = num_levels;
-        _border_mode    = border_mode;
-
-        _target    = compute_target(input_shape, border_mode, constant_border_value, format_in, format_out);
-        _reference = compute_reference(input_shape, border_mode, constant_border_value, format_in, format_out);
-    }
-
-protected:
-    template <typename V>
-    void fill(V &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    PyramidType compute_target(const TensorShape &input_shape, BorderMode border_mode, T constant_border_value,
-                               Format format_in, Format format_out)
-    {
-        // Create pyramid
-        PyramidType pyramid{};
-
-        // Create Pyramid Info
-        PyramidInfo pyramid_info(_pyramid_levels, SCALE_PYRAMID_HALF, input_shape, format_out);
-
-        // Use conservative padding strategy to fit all subsequent kernels
-        pyramid.init_auto_padding(pyramid_info);
-
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(input_shape, format_in);
-
-        // The first two dimensions of the output tensor must match the first
-        // two dimensions of the tensor in the last level of the pyramid
-        TensorShape dst_shape(input_shape);
-        dst_shape.set(0, pyramid.get_pyramid_level(_pyramid_levels - 1)->info()->dimension(0));
-        dst_shape.set(1, pyramid.get_pyramid_level(_pyramid_levels - 1)->info()->dimension(1));
-
-        // The lowest resolution tensor necessary to reconstruct the input
-        // tensor from the pyramid.
-        _dst_target = create_tensor<TensorType>(dst_shape, format_out);
-
-        // Create and configure function
-        FunctionType laplacian_pyramid;
-        laplacian_pyramid.configure(&src, &pyramid, &_dst_target, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(_dst_target.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        _dst_target.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!_dst_target.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        pyramid.allocate();
-
-        for(size_t i = 0; i < pyramid_info.num_levels(); ++i)
-        {
-            ARM_COMPUTE_EXPECT(!pyramid.get_pyramid_level(i)->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        laplacian_pyramid.run();
-
-        return pyramid;
-    }
-
-    std::vector<SimpleTensor<U>> compute_reference(const TensorShape &shape, BorderMode border_mode, T constant_border_value,
-                                                   Format format_in, Format format_out)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, format_in };
-
-        // The first two dimensions of the output tensor must match the first
-        // two dimensions of the tensor in the last level of the pyramid
-        TensorShape dst_shape(shape);
-        dst_shape.set(0, static_cast<float>(shape[0] + 1) / static_cast<float>(std::pow(2, _pyramid_levels - 1)));
-        dst_shape.set(1, static_cast<float>(shape[1] + 1) / static_cast<float>(std::pow(2, _pyramid_levels - 1)));
-
-        _dst_reference = SimpleTensor<U>(dst_shape, format_out);
-
-        // Fill reference
-        fill(src);
-
-        return reference::laplacian_pyramid<T, U>(src, _dst_reference, _pyramid_levels, border_mode, constant_border_value);
-    }
-
-    size_t                       _pyramid_levels{};
-    BorderMode                   _border_mode{};
-    SimpleTensor<U>              _dst_reference{};
-    TensorType                   _dst_target{};
-    PyramidType                  _target{};
-    std::vector<SimpleTensor<U>> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_LAPLACIAN_PYRAMID_FIXTURE */

diff --git a/tests/validation/fixtures/LaplacianReconstructFixture.h b/tests/validation/fixtures/LaplacianReconstructFixture.h
deleted file mode 100644
index 35432ee..0000000
--- a/tests/validation/fixtures/LaplacianReconstructFixture.h
+++ /dev/null

@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_LAPLACIAN_RECONSTRUCT_FIXTURE
-#define ARM_COMPUTE_TEST_LAPLACIAN_RECONSTRUCT_FIXTURE
-
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/PyramidInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/fixtures/LaplacianPyramidFixture.h"
-#include "tests/validation/reference/LaplacianPyramid.h"
-#include "tests/validation/reference/LaplacianReconstruct.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename LaplacianPyramidType, typename T, typename U, typename PyramidType>
-class LaplacianReconstructValidationFixture : public LaplacianPyramidValidationFixture<TensorType, AccessorType, LaplacianPyramidType, U, T, PyramidType>
-{
-public:
-    template <typename...>
-    void setup(TensorShape input_shape, BorderMode border_mode, size_t num_levels, Format format_in, Format format_out)
-    {
-        std::mt19937                     generator(library->seed());
-        std::uniform_int_distribution<U> distribution_u8(0, 255);
-        const U                          constant_border_value = distribution_u8(generator);
-
-        using LPF = LaplacianPyramidValidationFixture<TensorType, AccessorType, LaplacianPyramidType, U, T, PyramidType>;
-        LPF::setup(input_shape, border_mode, num_levels, format_out, format_in);
-
-        // Compute target and reference values using the pyramid and lowest
-        // resolution tensor output from Laplacian Pyramid kernel
-        _target    = compute_target(input_shape, LPF::_target, LPF::_dst_target, border_mode, constant_border_value);
-        _reference = compute_reference(LPF::_reference, LPF::_dst_reference, border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename V>
-    void fill(V &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &input_shape, PyramidType &pyramid, TensorType &low_res, BorderMode border_mode, U constant_border_value)
-    {
-        // Create tensors
-        TensorType dst = create_tensor<TensorType>(input_shape, DataType::U8);
-
-        // Create and configure function
-        FunctionType laplacian_reconstruct;
-        laplacian_reconstruct.configure(&pyramid, &low_res, &dst, border_mode, constant_border_value);
-
-        // Allocate tensors
-        dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Compute function
-        laplacian_reconstruct.run();
-
-        return dst;
-    }
-
-    SimpleTensor<U> compute_reference(const std::vector<SimpleTensor<T>> &pyramid,
-                                      const SimpleTensor<T> &low_res, BorderMode border_mode, U constant_border_value)
-    {
-        return reference::laplacian_reconstruct<T, U>(pyramid, low_res, border_mode, constant_border_value);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<U> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_LAPLACIAN_RECONSTRUCT_FIXTURE */

diff --git a/tests/validation/fixtures/MagnitudeFixture.h b/tests/validation/fixtures/MagnitudeFixture.h
deleted file mode 100644
index 81f4970..0000000
--- a/tests/validation/fixtures/MagnitudeFixture.h
+++ /dev/null

@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_MAGNITUDE_FIXTURE
-#define ARM_COMPUTE_TEST_MAGNITUDE_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Magnitude.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class MagnitudeValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, Format format, MagnitudeType magnitude_type)
-    {
-        _target         = compute_target(shape, format, magnitude_type);
-        _reference      = compute_reference(shape, format, magnitude_type);
-        _magnitude_type = magnitude_type;
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, std::random_device::result_type seed_offset)
-    {
-        library->fill_tensor_uniform(tensor, seed_offset);
-    }
-
-    TensorType compute_target(const TensorShape &shape, Format format, MagnitudeType magnitude_type)
-    {
-        DataType data_type = data_type_from_format(format);
-
-        // Create tensors
-        TensorType src1 = create_tensor<TensorType>(shape, data_type);
-        src1.info()->set_format(format);
-
-        TensorType src2 = create_tensor<TensorType>(shape, data_type);
-        src2.info()->set_format(format);
-
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-        dst.info()->set_format(format);
-
-        // Create and configure function
-        FunctionType magnitude;
-        magnitude.configure(&src1, &src2, &dst, magnitude_type);
-
-        ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src1.allocator()->allocate();
-        src2.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src1), 0);
-        fill(AccessorType(src2), 1);
-
-        // Compute function
-        magnitude.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, Format format, MagnitudeType magnitude_type)
-    {
-        DataType data_type = data_type_from_format(format);
-
-        // Create reference
-        SimpleTensor<T> src1{ shape, data_type };
-        SimpleTensor<T> src2{ shape, data_type };
-
-        // Fill reference
-        fill(src1, 0);
-        fill(src2, 1);
-
-        return reference::magnitude<T>(src1, src2, magnitude_type);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-    MagnitudeType   _magnitude_type{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_MAGNITUDE_FIXTURE */

diff --git a/tests/validation/fixtures/MeanStdDevFixture.h b/tests/validation/fixtures/MeanStdDevFixture.h
deleted file mode 100644
index f3facc5..0000000
--- a/tests/validation/fixtures/MeanStdDevFixture.h
+++ /dev/null

@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_MEAN_STD_DEV_FIXTURE
-#define ARM_COMPUTE_TEST_MEAN_STD_DEV_FIXTURE
-
-#include "tests/Globals.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/MeanStdDev.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class MeanStdDevValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type)
-    {
-        _target    = compute_target(shape, data_type);
-        _reference = compute_reference(shape, data_type);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        if(tensor.data_type() == DataType::F32)
-        {
-            std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
-            library->fill(tensor, distribution, 0);
-        }
-        else if(tensor.data_type() == DataType::F16)
-        {
-            arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
-            library->fill(tensor, distribution, 0);
-        }
-        else
-        {
-            library->fill_tensor_uniform(tensor, 0);
-        }
-    }
-
-    std::pair<float, float> compute_target(const TensorShape &shape, DataType data_type)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-
-        // Create output variables
-        float mean    = 0.0f;
-        float std_dev = 0.0f;
-
-        // Create and configure function
-        FunctionType mean_std_dev;
-        mean_std_dev.configure(&src, &mean, &std_dev);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        mean_std_dev.run();
-
-        return std::make_pair(mean, std_dev);
-    }
-
-    std::pair<float, float> compute_reference(const TensorShape &shape, DataType data_type)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        // Compute reference
-        return reference::mean_and_standard_deviation<T>(src);
-    }
-
-    std::pair<float, float> _target{};
-    std::pair<float, float> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_MEAN_STD_DEV_FIXTURE */

diff --git a/tests/validation/fixtures/Median3x3Fixture.h b/tests/validation/fixtures/Median3x3Fixture.h
deleted file mode 100644
index 2b97800..0000000
--- a/tests/validation/fixtures/Median3x3Fixture.h
+++ /dev/null

@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_MEDIAN3X3_FIXTURE
-#define ARM_COMPUTE_TEST_MEDIAN3X3_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Median3x3.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class Median3x3ValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, BorderMode border_mode)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution(0, 255);
-        const uint8_t                          constant_border_value = distribution(gen);
-
-        _border_mode = border_mode;
-        _target      = compute_target(shape, data_type, border_mode, constant_border_value);
-        _reference   = compute_reference(shape, data_type, border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType median3x3;
-        median3x3.configure(&src, &dst, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        median3x3.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        ARM_COMPUTE_ERROR_ON(data_type != DataType::U8);
-
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        // Compute reference
-        return reference::median3x3<T>(src, border_mode, constant_border_value);
-    }
-
-    BorderMode      _border_mode{};
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_MEDIAN3X3_FIXTURE */

diff --git a/tests/validation/fixtures/MinMaxLocationFixture.h b/tests/validation/fixtures/MinMaxLocationFixture.h
deleted file mode 100644
index 73466cc..0000000
--- a/tests/validation/fixtures/MinMaxLocationFixture.h
+++ /dev/null

@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_MIN_MAX_LOCATION_FIXTURE
-#define ARM_COMPUTE_TEST_MIN_MAX_LOCATION_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/Types.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/MinMaxLocation.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename ArrayType, typename ArrayAccessorType, typename FunctionType, typename T>
-class MinMaxLocationValidationFixture : public framework::Fixture
-{
-public:
-    using target_type = typename std::conditional<std::is_integral<T>::value, int32_t, float>::type;
-
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type)
-    {
-        _target    = compute_target(shape, data_type);
-        _reference = compute_reference(shape, data_type);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    MinMaxLocationValues<target_type> compute_target(const TensorShape &shape, DataType data_type)
-    {
-        MinMaxLocationValues<target_type> target;
-
-        ArrayType min_loc(shape.total_size());
-        ArrayType max_loc(shape.total_size());
-
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType min_max_loc;
-        min_max_loc.configure(&src, &target.min, &target.max, &min_loc, &max_loc);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        min_max_loc.run();
-
-        // Create accessor objects for mapping operations
-        ArrayAccessorType min_loc_accessor(min_loc);
-        ArrayAccessorType max_loc_accessor(max_loc);
-
-        // Move min Coordinates2D values from ArrayType to vector
-        for(size_t i = 0; i < min_loc.num_values(); ++i)
-        {
-            target.min_loc.push_back(std::move(min_loc_accessor.at(i)));
-        }
-
-        // Move max Coordinates2D values from ArrayType to vector
-        for(size_t i = 0; i < max_loc.num_values(); ++i)
-        {
-            target.max_loc.push_back(std::move(max_loc_accessor.at(i)));
-        }
-
-        return target;
-    }
-
-    MinMaxLocationValues<T> compute_reference(const TensorShape &shape, DataType data_type)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        return reference::min_max_location<T>(src);
-    }
-
-    MinMaxLocationValues<target_type> _target{};
-    MinMaxLocationValues<T>           _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_MIN_MAX_LOCATION_FIXTURE */

diff --git a/tests/validation/fixtures/NonLinearFilterFixture.h b/tests/validation/fixtures/NonLinearFilterFixture.h
deleted file mode 100644
index 03d2bcd..0000000
--- a/tests/validation/fixtures/NonLinearFilterFixture.h
+++ /dev/null

@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_NONLINEAR_FILTER_FIXTURE
-#define ARM_COMPUTE_TEST_NONLINEAR_FILTER_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/NonLinearFilter.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class NonLinearFilterValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, BorderMode border_mode, DataType data_type)
-    {
-        std::mt19937                           generator(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-        const uint8_t                          constant_border_value = distribution_u8(generator);
-
-        // Create the mask
-        std::vector<uint8_t> mask(mask_size * mask_size);
-        fill_mask_from_pattern(mask.data(), mask_size, mask_size, pattern);
-
-        _border_size = BorderSize(static_cast<int>(mask_size / 2));
-        _target      = compute_target(shape, data_type, function, mask_size, pattern, mask.data(), border_mode, constant_border_value);
-        _reference   = compute_reference(shape, data_type, function, mask_size, pattern, mask.data(), border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, BorderMode border_mode,
-                              uint8_t constant_border_value)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType non_linear_filter;
-        non_linear_filter.configure(&src, &dst, function, mask_size, pattern, mask, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        non_linear_filter.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                                      BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        return reference::non_linear_filter<T>(src, function, mask_size, pattern, mask, border_mode, constant_border_value);
-    }
-
-    BorderMode      _border_mode{};
-    BorderSize      _border_size{};
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_NONLINEAR_FILTER_FIXTURE */

diff --git a/tests/validation/fixtures/OpticalFlowFixture.h b/tests/validation/fixtures/OpticalFlowFixture.h
deleted file mode 100644
index 5c3285a..0000000
--- a/tests/validation/fixtures/OpticalFlowFixture.h
+++ /dev/null

@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_OPTICAL_FLOW
-#define ARM_COMPUTE_TEST_OPTICAL_FLOW
-
-#include "arm_compute/core/PyramidInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/Types.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/OpticalFlow.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType,
-          typename AccessorType,
-          typename ArrayType,
-          typename ArrayAccessorType,
-          typename FunctionType,
-          typename PyramidType,
-          typename PyramidFunctionType,
-          typename T>
-
-class OpticalFlowValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(std::string old_image_name, std::string new_image_name, OpticalFlowParameters params,
-               size_t num_levels, size_t num_keypoints, Format format, BorderMode border_mode)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-        const uint8_t                          constant_border_value = int_dist(gen);
-
-        // Create keypoints
-        std::vector<KeyPoint> old_keypoints           = generate_random_keypoints(library->get_image_shape(old_image_name), num_keypoints, library->seed(), num_levels);
-        std::vector<KeyPoint> new_keypoints_estimates = old_keypoints;
-
-        _target    = compute_target(old_image_name, new_image_name, params, num_levels, old_keypoints, new_keypoints_estimates, format, border_mode, constant_border_value);
-        _reference = compute_reference(old_image_name, new_image_name, params, num_levels, old_keypoints, new_keypoints_estimates, format, border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename V>
-    void fill(V &&tensor, const std::string image, Format format)
-    {
-        library->fill(tensor, image, format);
-    }
-
-    ArrayType compute_target(std::string old_image_name, std::string new_image_name, OpticalFlowParameters params, size_t num_levels,
-                             std::vector<KeyPoint> &old_keypoints, std::vector<KeyPoint> &new_keypoints_estimates,
-                             Format format, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Get image shapes
-        TensorShape old_shape = library->get_image_shape(old_image_name);
-        TensorShape new_shape = library->get_image_shape(new_image_name);
-
-        // Create tensors
-        auto old_image = create_tensor<TensorType>(old_shape, format);
-        auto new_image = create_tensor<TensorType>(new_shape, format);
-
-        // Load keypoints
-        ArrayType old_points(old_keypoints.size());
-        ArrayType new_points_estimates(new_keypoints_estimates.size());
-        ArrayType new_points(old_keypoints.size());
-
-        fill_array(ArrayAccessorType(old_points), old_keypoints);
-        fill_array(ArrayAccessorType(new_points_estimates), new_keypoints_estimates);
-
-        // Create pyramid images
-        PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, old_image.info()->tensor_shape(), format);
-        PyramidType old_pyramid = create_pyramid<PyramidType>(pyramid_info);
-        PyramidType new_pyramid = create_pyramid<PyramidType>(pyramid_info);
-
-        // Create and configure pyramid functions
-        PyramidFunctionType old_gp;
-        old_gp.configure(&old_image, &old_pyramid, border_mode, constant_border_value);
-
-        PyramidFunctionType new_gp;
-        new_gp.configure(&new_image, &new_pyramid, border_mode, constant_border_value);
-
-        for(size_t i = 0; i < pyramid_info.num_levels(); ++i)
-        {
-            ARM_COMPUTE_EXPECT(old_pyramid.get_pyramid_level(i)->info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(new_pyramid.get_pyramid_level(i)->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        // Create and configure optical flow function
-        FunctionType optical_flow;
-
-        optical_flow.configure(&old_pyramid,
-                               &new_pyramid,
-                               &old_points,
-                               &new_points_estimates,
-                               &new_points,
-                               params.termination,
-                               params.epsilon,
-                               params.num_iterations,
-                               params.window_dimension,
-                               params.use_initial_estimate,
-                               border_mode,
-                               constant_border_value);
-
-        ARM_COMPUTE_EXPECT(old_image.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(new_image.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate input tensors
-        old_image.allocator()->allocate();
-        new_image.allocator()->allocate();
-
-        // Allocate pyramids
-        old_pyramid.allocate();
-        new_pyramid.allocate();
-
-        ARM_COMPUTE_EXPECT(!old_image.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!new_image.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        for(size_t i = 0; i < pyramid_info.num_levels(); ++i)
-        {
-            ARM_COMPUTE_EXPECT(!old_pyramid.get_pyramid_level(i)->info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(!new_pyramid.get_pyramid_level(i)->info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        // Fill tensors
-        fill(AccessorType(old_image), old_image_name, format);
-        fill(AccessorType(new_image), new_image_name, format);
-
-        // Compute functions
-        old_gp.run();
-        new_gp.run();
-        optical_flow.run();
-
-        return new_points;
-    }
-
-    std::vector<KeyPoint> compute_reference(std::string old_image_name, std::string new_image_name,
-                                            OpticalFlowParameters params, size_t num_levels,
-                                            std::vector<KeyPoint> &old_keypoints, std::vector<KeyPoint> &new_keypoints_estimates,
-                                            Format format, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        SimpleTensor<T> old_image{ library->get_image_shape(old_image_name), data_type_from_format(format) };
-        SimpleTensor<T> new_image{ library->get_image_shape(new_image_name), data_type_from_format(format) };
-
-        fill(old_image, old_image_name, format);
-        fill(new_image, new_image_name, format);
-
-        return reference::optical_flow<T>(old_image, new_image, params, num_levels, old_keypoints, new_keypoints_estimates,
-                                          border_mode, constant_border_value);
-    }
-
-    ArrayType             _target{};
-    std::vector<KeyPoint> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_OPTICAL_FLOW */

diff --git a/tests/validation/fixtures/PhaseFixture.h b/tests/validation/fixtures/PhaseFixture.h
deleted file mode 100644
index b80d1ae..0000000
--- a/tests/validation/fixtures/PhaseFixture.h
+++ /dev/null

@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_PHASE_FIXTURE
-#define ARM_COMPUTE_TEST_PHASE_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Phase.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class PhaseValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, Format format, PhaseType phase_type)
-    {
-        _target    = compute_target(shape, format, phase_type);
-        _reference = compute_reference(shape, format, phase_type);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, std::random_device::result_type seed_offset)
-    {
-        library->fill_tensor_uniform(tensor, seed_offset);
-    }
-
-    TensorType compute_target(const TensorShape &shape, Format format, PhaseType phase_type)
-    {
-        DataType data_type = data_type_from_format(format);
-
-        // Create tensors
-        TensorType src1 = create_tensor<TensorType>(shape, data_type);
-        src1.info()->set_format(format);
-
-        TensorType src2 = create_tensor<TensorType>(shape, data_type);
-        src2.info()->set_format(format);
-
-        TensorType dst = create_tensor<TensorType>(shape, DataType::U8);
-        dst.info()->set_format(Format::U8);
-
-        // Create and configure function
-        FunctionType phase;
-
-        phase.configure(&src1, &src2, &dst, phase_type);
-
-        ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src1.allocator()->allocate();
-        src2.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src1), 0);
-        fill(AccessorType(src2), 1);
-
-        // Compute function
-        phase.run();
-
-        return dst;
-    }
-
-    SimpleTensor<uint8_t> compute_reference(const TensorShape &shape, Format format, PhaseType phase_type)
-    {
-        DataType data_type = data_type_from_format(format);
-
-        // Create reference
-        SimpleTensor<T> src1{ shape, data_type };
-        SimpleTensor<T> src2{ shape, data_type };
-
-        // Fill reference
-        fill(src1, 0);
-        fill(src2, 1);
-
-        return reference::phase<T>(src1, src2, phase_type);
-    }
-
-    TensorType            _target{};
-    SimpleTensor<uint8_t> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_PHASE_FIXTURE */

diff --git a/tests/validation/fixtures/SobelFixture.h b/tests/validation/fixtures/SobelFixture.h
deleted file mode 100644
index 61a6a80..0000000
--- a/tests/validation/fixtures/SobelFixture.h
+++ /dev/null

@@ -1,192 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_SOBEL_FIXTURE
-#define ARM_COMPUTE_TEST_SOBEL_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Sobel.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class CLSobel3x3;
-class CLSobel5x5;
-class CLSobel7x7;
-class NESobel3x3;
-class NESobel5x5;
-class NESobel7x7;
-
-namespace test
-{
-namespace validation
-{
-namespace
-{
-template <typename Function>
-struct info;
-
-template <>
-struct info<NESobel3x3>
-{
-    static const Format dst_format  = Format::S16;
-    static const int    filter_size = 3;
-};
-
-template <>
-struct info<CLSobel3x3>
-{
-    static const Format dst_format  = Format::S16;
-    static const int    filter_size = 3;
-};
-
-template <>
-struct info<NESobel5x5>
-{
-    static const Format dst_format  = Format::S16;
-    static const int    filter_size = 5;
-};
-
-template <>
-struct info<CLSobel5x5>
-{
-    static const Format dst_format  = Format::S16;
-    static const int    filter_size = 5;
-};
-
-template <>
-struct info<NESobel7x7>
-{
-    static const Format dst_format  = Format::S32;
-    static const int    filter_size = 7;
-};
-
-template <>
-struct info<CLSobel7x7>
-{
-    static const Format dst_format  = Format::S32;
-    static const int    filter_size = 7;
-};
-} // namespace
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename U>
-class SobelValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, BorderMode border_mode, Format format, GradientDimension gradient_dimension)
-    {
-        // Generate a random constant value
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-        const uint8_t                          constant_border_value = int_dist(gen);
-
-        _border_mode = border_mode;
-        _target      = compute_target(shape, border_mode, format, constant_border_value, gradient_dimension);
-        _reference   = compute_reference(shape, info<FunctionType>::filter_size, border_mode, format, constant_border_value, gradient_dimension);
-    }
-
-protected:
-    template <typename V>
-    void fill(V &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    std::pair<TensorType, TensorType> compute_target(const TensorShape &shape, BorderMode border_mode, Format format, uint8_t constant_border_value, GradientDimension gradient_dimension)
-    {
-        // Create tensors
-        TensorType src   = create_tensor<TensorType>(shape, data_type_from_format(format));
-        TensorType dst_x = create_tensor<TensorType>(shape, data_type_from_format(info<FunctionType>::dst_format));
-        TensorType dst_y = create_tensor<TensorType>(shape, data_type_from_format(info<FunctionType>::dst_format));
-
-        src.info()->set_format(format);
-        dst_x.info()->set_format(info<FunctionType>::dst_format);
-        dst_y.info()->set_format(info<FunctionType>::dst_format);
-
-        FunctionType sobel;
-
-        switch(gradient_dimension)
-        {
-            case GradientDimension::GRAD_X:
-                sobel.configure(&src, &dst_x, nullptr, border_mode, constant_border_value);
-                break;
-            case GradientDimension::GRAD_Y:
-                sobel.configure(&src, nullptr, &dst_y, border_mode, constant_border_value);
-                break;
-            case GradientDimension::GRAD_XY:
-                sobel.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Gradient dimension not supported");
-        }
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst_x.allocator()->allocate();
-        dst_y.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        sobel.run();
-
-        return std::make_pair(std::move(dst_x), std::move(dst_y));
-    }
-
-    std::pair<SimpleTensor<U>, SimpleTensor<U>> compute_reference(const TensorShape &shape, int filter_size, BorderMode border_mode, Format format, uint8_t constant_border_value,
-                                                                  GradientDimension gradient_dimension)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, format };
-
-        // Fill reference
-        fill(src);
-
-        return reference::sobel<U>(src, filter_size, border_mode, constant_border_value, gradient_dimension);
-    }
-
-    BorderMode _border_mode{ BorderMode::UNDEFINED };
-    std::pair<TensorType, TensorType>           _target{};
-    std::pair<SimpleTensor<U>, SimpleTensor<U>> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_SOBEL_FIXTURE */

diff --git a/tests/validation/fixtures/TableLookupFixture.h b/tests/validation/fixtures/TableLookupFixture.h
deleted file mode 100644
index a50c9fb..0000000
--- a/tests/validation/fixtures/TableLookupFixture.h
+++ /dev/null

@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_TABLE_LOOKUP_FIXTURE
-#define ARM_COMPUTE_TEST_TABLE_LOOKUP_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/RawLutAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/TableLookup.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename LutAccessorType, typename LutType, typename T>
-class TableLookupValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type)
-    {
-        _target    = compute_target(shape, data_type);
-        _reference = compute_reference(shape, data_type);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i)
-    {
-        library->fill_tensor_uniform(tensor, i);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type)
-    {
-        // Create Lut
-        const int num_elem = (data_type == DataType::U8) ? std::numeric_limits<uint8_t>::max() + 1 : std::numeric_limits<int16_t>::max() - std::numeric_limits<int16_t>::lowest() + 1;
-        LutType   lut(num_elem, data_type);
-
-        //Fill the Lut
-        fill_lookuptable(LutAccessorType(lut));
-
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType table_lookup;
-        table_lookup.configure(&src, &lut, &dst);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src), 0);
-        fill(AccessorType(dst), 1);
-
-        // Compute function
-        table_lookup.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type)
-    {
-        // Create rawLut
-        std::map<T, T> rawlut;
-
-        // Fill the Lut
-        fill_lookuptable(RawLutAccessor<T>(rawlut));
-
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src, 0);
-
-        return reference::table_lookup(src, rawlut);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_TABLE_LOOKUP_FIXTURE */

diff --git a/tests/validation/fixtures/ThresholdFixture.h b/tests/validation/fixtures/ThresholdFixture.h
deleted file mode 100644
index 038c296..0000000
--- a/tests/validation/fixtures/ThresholdFixture.h
+++ /dev/null

@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_THRESHOLD_FIXTURE
-#define ARM_COMPUTE_TEST_THRESHOLD_FIXTURE
-
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Threshold.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ThresholdValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper, DataType data_type)
-    {
-        _target    = compute_target(shape, data_type, threshold, false_value, true_value, type, upper);
-        _reference = compute_reference(shape, data_type, threshold, false_value, true_value, type, upper);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type,
-                              uint8_t threshold, uint8_t false_value, uint8_t true_value,
-                              ThresholdType type, uint8_t upper)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType thrsh;
-        thrsh.configure(&src, &dst, ThresholdKernelInfo(threshold, false_value, true_value, type, upper));
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        thrsh.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type,
-                                      uint8_t threshold, uint8_t false_value, uint8_t true_value,
-                                      ThresholdType type, uint8_t upper)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        return reference::threshold<T>(src, threshold, false_value, true_value, type, upper);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_THRESHOLD_FIXTURE */

diff --git a/tests/validation/fixtures/WarpAffineFixture.h b/tests/validation/fixtures/WarpAffineFixture.h
deleted file mode 100644
index 014d662..0000000
--- a/tests/validation/fixtures/WarpAffineFixture.h
+++ /dev/null

@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_WARP_AFFINE_FIXTURE
-#define ARM_COMPUTE_TEST_WARP_AFFINE_FIXTURE
-
-#include <memory>
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Utils.h"
-#include "tests/validation/reference/WarpAffine.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class WarpAffineValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, InterpolationPolicy policy, BorderMode border_mode)
-    {
-        // Generate a random constant value if border_mode is constant
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-        uint8_t                                constant_border_value = distribution_u8(gen);
-
-        // Create the matrix
-        std::array<float, 9> matrix{ {} };
-        fill_warp_matrix<9>(matrix);
-
-        _target    = compute_target(shape, data_type, matrix, policy, border_mode, constant_border_value);
-        _reference = compute_reference(shape, data_type, matrix, policy, border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType warp_affine;
-        warp_affine.configure(&src, &dst, matrix, policy, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        warp_affine.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Create the valid mask Tensor
-        _valid_mask = SimpleTensor<T>(shape, data_type);
-
-        // Fill reference
-        fill(src);
-
-        return reference::warp_affine<T>(src, _valid_mask, matrix.data(), policy, border_mode, constant_border_value);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-    SimpleTensor<T> _valid_mask{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_WARP_AFFINE_FIXTURE */

diff --git a/tests/validation/fixtures/WarpPerspectiveFixture.h b/tests/validation/fixtures/WarpPerspectiveFixture.h
deleted file mode 100644
index 40ae3b9..0000000
--- a/tests/validation/fixtures/WarpPerspectiveFixture.h
+++ /dev/null

@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_WARP_PERSPECTIVE_FIXTURE
-#define ARM_COMPUTE_TEST_WARP_PERSPECTIVE_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Utils.h"
-#include "tests/validation/reference/WarpPerspective.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class WarpPerspectiveValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape input_shape, DataType data_type, InterpolationPolicy policy, BorderMode border_mode)
-    {
-        uint8_t constant_border_value = 0;
-        // Generate a random constant value if border_mode is constant
-        if(border_mode == BorderMode::CONSTANT)
-        {
-            std::mt19937                           gen(library->seed());
-            std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-            constant_border_value = distribution_u8(gen);
-        }
-
-        // Create the matrix
-        std::array<float, 9> matrix = { { 0 } };
-        fill_warp_matrix<9>(matrix);
-
-        _target    = compute_target(input_shape, matrix, policy, border_mode, constant_border_value, data_type);
-        _reference = compute_reference(input_shape, matrix, policy, border_mode, constant_border_value, data_type);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
-                              uint8_t  constant_border_value,
-                              DataType data_type)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType warp_perspective;
-        warp_perspective.configure(&src, &dst, matrix, policy, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        warp_perspective.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
-                                      uint8_t  constant_border_value,
-                                      DataType data_type)
-    {
-        ARM_COMPUTE_ERROR_ON(data_type != DataType::U8);
-
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Create the valid mask Tensor
-        _valid_mask = SimpleTensor<T>(shape, data_type);
-
-        // Fill reference
-        fill(src);
-
-        // Compute reference
-        return reference::warp_perspective<T>(src, _valid_mask, matrix.data(), policy, border_mode, constant_border_value);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-    BorderMode      _border_mode{};
-    SimpleTensor<T> _valid_mask{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_WARP_PERSPECTIVE_FIXTURE */

diff --git a/tests/validation/reference/Convolution.cpp b/tests/validation/reference/Convolution.cpp
deleted file mode 100644
index 0a4e043..0000000
--- a/tests/validation/reference/Convolution.cpp
+++ /dev/null

@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-
-#include "Convolution.h"
-#include "Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> convolution(const SimpleTensor<uint8_t> &src, DataType output_data_type, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value,
-                            const unsigned int width,
-                            const unsigned int height)
-{
-    ARM_COMPUTE_ERROR_ON(scale == 0);
-    ARM_COMPUTE_ERROR_ON(scale >= static_cast<unsigned int>(std::numeric_limits<int32_t>::max()));
-
-    SimpleTensor<T>       dst(src.shape(), output_data_type);
-    SimpleTensor<int32_t> sum(src.shape(), output_data_type);
-    const uint32_t        num_elements = src.num_elements();
-#if defined(_OPENMP)
-    #pragma omp parallel for
-#endif /* _OPENMP */
-    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
-    {
-        const Coordinates id = index2coord(src.shape(), element_idx);
-        apply_2d_spatial_filter(id, src, sum, TensorShape(width, height), conv, 1, border_mode, constant_border_value);
-        dst[element_idx] = saturate_cast<T>(tensor_elem_at<int32_t>(sum, id, border_mode, constant_border_value) / static_cast<int>(scale));
-    }
-
-    return dst;
-}
-
-template SimpleTensor<uint8_t> convolution(const SimpleTensor<uint8_t> &src, DataType output_data_type, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value,
-                                           const unsigned int widht, const unsigned int height);
-template SimpleTensor<int16_t> convolution(const SimpleTensor<uint8_t> &src, DataType output_data_type, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value,
-                                           const unsigned int widht, const unsigned int height);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute

diff --git a/tests/validation/reference/Convolution.h b/tests/validation/reference/Convolution.h
deleted file mode 100644
index 174ce7e..0000000
--- a/tests/validation/reference/Convolution.h
+++ /dev/null

@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_CONVOLUTION_H
-#define ARM_COMPUTE_TEST_CONVOLUTION_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> convolution(const SimpleTensor<uint8_t> &src, DataType output_data_type, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value,
-                            const unsigned int width,
-                            const unsigned int height);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_CONVOLUTION_H */
commit	473cb01e84cef6cab057e9492bfa3b68f708e5d7	[log] [tgz]
author	Michalis Spyrou <michalis.spyrou@arm.com>	Tue Feb 23 11:48:12 2021 +0000
committer	Michalis Spyrou <michalis.spyrou@arm.com>	Wed Mar 03 15:04:20 2021 +0000
tree	a500b8a8afe6a0442e1a54fb8d52c77d22543bcb
parent	f466d75f85938b96dd14675ec091193bdce12122 [diff]