Remove Computer Vision CL support

Resolves COMPMID-4151

Change-Id: I46f541efe8c4087f27794d2e158b6c1547d459ba
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5160
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 75f76ea..14d3a2c 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -177,10 +177,6 @@
 using namespace arm_compute;
 const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
 {
-    { "absdiff", "absdiff.cl" },
-    { "accumulate", "accumulate.cl" },
-    { "accumulate_squared", "accumulate.cl" },
-    { "accumulate_weighted", "accumulate.cl" },
     { "activation_layer", "activation_layer.cl" },
     { "activation_layer_quant", "activation_layer_quant.cl" },
     { "activation_layer_quant_f32", "activation_layer_quant.cl" },
@@ -200,21 +196,8 @@
     { "bitwise_not", "bitwise_op.cl" },
     { "bounding_box_transform", "bounding_box_transform.cl" },
     { "bounding_box_transform_quantized", "bounding_box_transform_quantized.cl" },
-    { "channel_combine_NV", "channel_combine.cl" },
-    { "channel_combine_RGB888", "channel_combine.cl" },
-    { "channel_combine_RGBA8888", "channel_combine.cl" },
-    { "channel_combine_UYVY422", "channel_combine.cl" },
-    { "channel_combine_YUYV422", "channel_combine.cl" },
     { "channel_shuffle_nchw", "channel_shuffle.cl" },
     { "channel_shuffle_nhwc", "channel_shuffle.cl" },
-    { "channel_extract_NV12", "channel_extract.cl" },
-    { "channel_extract_NV21", "channel_extract.cl" },
-    { "channel_extract_RGB888", "channel_extract.cl" },
-    { "channel_extract_RGBA8888", "channel_extract.cl" },
-    { "channel_extract_UYVY422", "channel_extract.cl" },
-    { "channel_extract_YUYV422", "channel_extract.cl" },
-    { "combine_gradients_L1", "canny.cl" },
-    { "combine_gradients_L2", "canny.cl" },
     { "compare_equal", "comparisons.cl" },
     { "compare_equal_quantized", "comparisons.cl" },
     { "compare_notequal", "comparisons.cl" },
@@ -232,25 +215,11 @@
     { "concatenate_height", "concatenate.cl" },
     { "concatenate_width_x2", "concatenate.cl" },
     { "concatenate_width_x4", "concatenate.cl" },
-    { "convolution_rectangle", "convolution_rectangle.cl" },
     { "col2im", "col2im.cl" },
     { "convert_depth_down", "depth_convert.cl" },
     { "convert_depth_up", "depth_convert.cl" },
     { "convert_fc_weights", "convert_fc_weights.cl" },
-    { "convolution3x3_static", "convolution3x3.cl" },
-    { "convolution5x5_static", "convolution5x5.cl" },
-    { "convolution7x7_static", "convolution7x7.cl" },
-    { "convolution9x9_static", "convolution9x9.cl" },
-    { "convolution_separable1x5_static", "convolution5x5.cl" },
-    { "convolution_separable5x1_static", "convolution5x5.cl" },
-    { "convolution_separable1x7_static", "convolution7x7.cl" },
-    { "convolution_separable7x1_static", "convolution7x7.cl" },
-    { "convolution_separable1x9_static", "convolution9x9.cl" },
-    { "convolution_separable9x1_static", "convolution9x9.cl" },
     { "copy_tensor", "copy_tensor.cl" },
-    { "copy_plane", "channel_extract.cl" },
-    { "copy_planes_3p", "channel_combine.cl" },
-    { "copy_to_keypoint", "fast_corners.cl" },
     { "crop_tensor", "crop_tensor.cl" },
     { "deconvolution_reshape", "deconvolution_layer.cl" },
     { "deconvolution_upsample", "deconvolution_layer.cl" },
@@ -275,8 +244,6 @@
     { "dequantization_layer", "dequantization_layer.cl" },
     { "dequantization_layer_per_channel_nhwc", "dequantization_layer.cl" },
     { "dequantization_layer_per_channel_nchw", "dequantization_layer.cl" },
-    { "derivative", "derivative.cl" },
-    { "dilate", "dilate.cl" },
     { "direct_convolution_nhwc", "direct_convolution.cl" },
     { "direct_convolution1x1", "direct_convolution1x1.cl" },
     { "direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl" },
@@ -303,8 +270,6 @@
     { "elementwise_operation_SQUARED_DIFF_quantized", "elementwise_operation_quantized.cl" },
     { "elementwise_operation_PRELU_quantized", "elementwise_operation_quantized.cl" },
     { "elementwise_unary", "elementwise_unary.cl" },
-    { "erode", "erode.cl" },
-    { "fast_corners", "fast_corners.cl" },
     { "fft_digit_reverse_axis_0", "fft_digit_reverse.cl" },
     { "fft_digit_reverse_axis_1", "fft_digit_reverse.cl" },
     { "fft_radix_2_first_stage_axis_0", "fft.cl" },
@@ -334,12 +299,9 @@
     { "fft_scale_conj", "fft_scale.cl" },
     { "fill_image_borders_constant", "fill_border.cl" },
     { "fill_image_borders_replicate", "fill_border.cl" },
-    { "finalize", "optical_flow_pyramid_lk.cl" },
     { "floor_layer", "floor.cl" },
     { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
     { "gather", "gather.cl" },
-    { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
-    { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
     { "gemm_ma_f16", "gemm.cl" },
     { "gemm_ma_f32", "gemm.cl" },
     { "gemm_mv", "gemv.cl" },
@@ -384,17 +346,6 @@
     { "gemmlowp_output_stage_quantize_down_float", "gemmlowp.cl" },
     { "generate_proposals_compute_all_anchors", "generate_proposals.cl" },
     { "generate_proposals_compute_all_anchors_quantized", "generate_proposals_quantized.cl" },
-    { "harris_score_3x3", "harris_corners.cl" },
-    { "harris_score_5x5", "harris_corners.cl" },
-    { "harris_score_7x7", "harris_corners.cl" },
-    { "hist_border_kernel", "histogram.cl" },
-    { "hist_border_kernel_fixed", "histogram.cl" },
-    { "hist_local_kernel", "histogram.cl" },
-    { "hist_local_kernel_fixed", "histogram.cl" },
-    { "hog_block_normalization", "hog.cl" },
-    { "hog_detector", "hog.cl" },
-    { "hog_orientation_binning", "hog.cl" },
-    { "hysteresis", "canny.cl" },
     { "im2col1x1_stridex1_nchw", "im2col.cl" },
     { "im2col3x3_nchw", "im2col.cl" },
     { "im2col5x5_nchw", "im2col.cl" },
@@ -404,36 +355,14 @@
     { "im2col3x3_nhwc", "im2col.cl" },
     { "im2col9x9_nhwc", "im2col.cl" },
     { "im2col_generic_nhwc", "im2col.cl" },
-    { "init_level", "optical_flow_pyramid_lk.cl" },
-    { "init_level_max", "optical_flow_pyramid_lk.cl" },
-    { "init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl" },
     { "instance_normalization", "instance_normalization.cl" },
-    { "integral_horizontal", "integral_image.cl" },
-    { "integral_vertical", "integral_image.cl" },
-    { "IYUV_to_NV12_bt709", "color_convert.cl" },
-    { "IYUV_to_RGB888_bt709", "color_convert.cl" },
-    { "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
-    { "IYUV_to_YUV444_bt709", "color_convert.cl" },
     { "l2_normalize_x", "l2_normalize.cl" },
     { "l2_normalize_y", "l2_normalize.cl" },
     { "l2_normalize_z", "l2_normalize.cl" },
-    { "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
-    { "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
-    { "magnitude_phase", "magnitude_phase.cl" },
     { "max_unpooling_layer_2", "unpooling_layer.cl" },
-    { "mean_stddev_accumulate", "mean_stddev.cl" },
     { "mean_stddev_normalization", "mean_stddev_normalization.cl" },
     { "memset", "memset.cl" },
-    { "minmax", "minmaxloc.cl" },
-    { "minmax_border", "minmaxloc.cl" },
     { "minmax_layer", "minmax_layer.cl" },
-    { "minmaxloc", "minmaxloc.cl" },
-    { "non_linear_filter_box3x3", "non_linear_filter3x3.cl" },
-    { "non_linear_filter_cross3x3", "non_linear_filter3x3.cl" },
-    { "non_linear_filter_disk3x3", "non_linear_filter3x3.cl" },
-    { "non_linear_filter_box5x5", "non_linear_filter5x5.cl" },
-    { "non_linear_filter_cross5x5", "non_linear_filter5x5.cl" },
-    { "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" },
     { "non_max_suppression", "nonmax.cl" },
     { "normalization_layer_cross_map", "normalization_layer.cl" },
     { "normalization_layer_in_map_nchw", "normalization_layer.cl" },
@@ -442,14 +371,6 @@
     { "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" },
     { "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" },
     { "normalize_planar_yuv_layer_q8_nhwc", "normalize_planar_yuv_layer_quantized.cl" },
-    { "NV12_to_IYUV_bt709", "color_convert.cl" },
-    { "NV12_to_RGB888_bt709", "color_convert.cl" },
-    { "NV12_to_RGBA8888_bt709", "color_convert.cl" },
-    { "NV12_to_YUV444_bt709", "color_convert.cl" },
-    { "NV21_to_IYUV_bt709", "color_convert.cl" },
-    { "NV21_to_RGB888_bt709", "color_convert.cl" },
-    { "NV21_to_RGBA8888_bt709", "color_convert.cl" },
-    { "NV21_to_YUV444_bt709", "color_convert.cl" },
     { "pad_layer_constant", "pad_layer.cl" },
     { "pad_layer_symmetric_reflect", "pad_layer.cl" },
     { "permute", "permute.cl" },
@@ -485,15 +406,6 @@
     { "reshape_layer", "reshape_layer.cl" },
     { "reshape_to_columns", "convolution_layer.cl" },
     { "reverse", "reverse.cl" },
-    { "RGB888_to_IYUV_bt709", "color_convert.cl" },
-    { "RGB888_to_NV12_bt709", "color_convert.cl" },
-    { "RGB888_to_RGBA8888_bt709", "color_convert.cl" },
-    { "RGB888_to_U8_bt709", "color_convert.cl" },
-    { "RGB888_to_YUV444_bt709", "color_convert.cl" },
-    { "RGBA8888_to_IYUV_bt709", "color_convert.cl" },
-    { "RGBA8888_to_NV12_bt709", "color_convert.cl" },
-    { "RGBA8888_to_RGB888_bt709", "color_convert.cl" },
-    { "RGBA8888_to_YUV444_bt709", "color_convert.cl" },
     { "roi_align_layer", "roi_align_layer.cl" },
     { "roi_align_layer_quantized", "roi_align_layer_quantized.cl" },
     { "roi_pooling_layer", "roi_pooling_layer.cl" },
@@ -503,15 +415,9 @@
     { "scale_bilinear_nhwc", "scale.cl" },
     { "scale_bilinear_quantized_nchw", "scale_quantized.cl" },
     { "scale_bilinear_quantized_nhwc", "scale_quantized.cl" },
-    { "scharr3x3", "scharr_filter.cl" },
     { "select_same_rank", "select.cl" },
     { "select_different_rank_2", "select.cl" },
     { "select_different_rank_n", "select.cl" },
-    { "sobel3x3", "sobel_filter.cl" },
-    { "sobel_separable5x1", "sobel_filter.cl" },
-    { "sobel_separable1x5", "sobel_filter.cl" },
-    { "sobel_separable7x1", "sobel_filter.cl" },
-    { "sobel_separable1x7", "sobel_filter.cl" },
     { "softmax_layer_norm", "softmax_layer.cl" },
     { "softmax_layer_norm_quantized", "softmax_layer_quantized.cl" },
     { "softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl" },
@@ -526,23 +432,10 @@
     { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
     { "stack_layer", "stack_layer.cl" },
     { "strided_slice", "slice_ops.cl" },
-    { "suppress_non_maximum", "canny.cl" },
-    { "tablelookup_U8", "tablelookup.cl" },
-    { "tablelookup_S16", "tablelookup.cl" },
-    { "threshold_binary", "threshold.cl" },
-    { "threshold_range", "threshold.cl" },
     { "tile", "tile.cl" },
     { "transpose", "transpose.cl" },
-    { "UYVY422_to_IYUV_bt709", "color_convert.cl" },
-    { "UYVY422_to_NV12_bt709", "color_convert.cl" },
-    { "UYVY422_to_RGB888_bt709", "color_convert.cl" },
-    { "UYVY422_to_RGBA8888_bt709", "color_convert.cl" },
     { "upsample_layer_nchw", "upsample_layer.cl" },
     { "upsample_layer_nhwc", "upsample_layer.cl" },
-    { "warp_affine_nearest_neighbour", "warp_affine.cl" },
-    { "warp_affine_bilinear", "warp_affine.cl" },
-    { "warp_perspective_nearest_neighbour", "warp_perspective.cl" },
-    { "warp_perspective_bilinear", "warp_perspective.cl" },
     { "winograd_filter_transform_2x2_3x3_nchw", "winograd_filter_transform.cl" },
     { "winograd_filter_transform_2x1_3x1_nchw", "winograd_filter_transform.cl" },
     { "winograd_filter_transform_1x2_1x3_nchw", "winograd_filter_transform.cl" },
@@ -602,24 +495,12 @@
     { "winograd_output_transform_1x2_1x7_nhwc", "winograd_output_transform.cl" },
     { "yolo_layer_nchw", "yolo_layer.cl" },
     { "yolo_layer_nhwc", "yolo_layer.cl" },
-    { "YUYV422_to_IYUV_bt709", "color_convert.cl" },
-    { "YUYV422_to_NV12_bt709", "color_convert.cl" },
-    { "YUYV422_to_RGB888_bt709", "color_convert.cl" },
-    { "YUYV422_to_RGBA8888_bt709", "color_convert.cl" },
 };
 
 const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
 {
 #ifdef EMBEDDED_KERNELS
     {
-        "absdiff.cl",
-#include "./cl_kernels/absdiff.clembed"
-    },
-    {
-        "accumulate.cl",
-#include "./cl_kernels/accumulate.clembed"
-    },
-    {
         "activation_layer.cl",
 #include "./cl_kernels/activation_layer.clembed"
     },
@@ -648,18 +529,6 @@
 #include "./cl_kernels/bounding_box_transform_quantized.clembed"
     },
     {
-        "canny.cl",
-#include "./cl_kernels/canny.clembed"
-    },
-    {
-        "channel_combine.cl",
-#include "./cl_kernels/channel_combine.clembed"
-    },
-    {
-        "channel_extract.cl",
-#include "./cl_kernels/channel_extract.clembed"
-    },
-    {
         "channel_shuffle.cl",
 #include "./cl_kernels/channel_shuffle.clembed"
     },
@@ -676,38 +545,14 @@
 #include "./cl_kernels/concatenate.clembed"
     },
     {
-        "color_convert.cl",
-#include "./cl_kernels/color_convert.clembed"
-    },
-    {
         "convert_fc_weights.cl",
 #include "./cl_kernels/convert_fc_weights.clembed"
-    },
-    {
-        "convolution3x3.cl",
-#include "./cl_kernels/convolution3x3.clembed"
-    },
-    {
-        "convolution5x5.cl",
-#include "./cl_kernels/convolution5x5.clembed"
-    },
-    {
-        "convolution7x7.cl",
-#include "./cl_kernels/convolution7x7.clembed"
-    },
-    {
-        "convolution9x9.cl",
-#include "./cl_kernels/convolution9x9.clembed"
-    },
+    },
     {
         "convolution_layer.cl",
 #include "./cl_kernels/convolution_layer.clembed"
     },
     {
-        "convolution_rectangle.cl",
-#include "./cl_kernels/convolution_rectangle.clembed"
-    },
-    {
         "copy_tensor.cl",
 #include "./cl_kernels/copy_tensor.clembed"
     },
@@ -744,14 +589,6 @@
 #include "./cl_kernels/dequantization_layer.clembed"
     },
     {
-        "derivative.cl",
-#include "./cl_kernels/derivative.clembed"
-    },
-    {
-        "dilate.cl",
-#include "./cl_kernels/dilate.clembed"
-    },
-    {
         "direct_convolution1x1.cl",
 #include "./cl_kernels/direct_convolution1x1.clembed"
     },
@@ -784,14 +621,6 @@
 #include "./cl_kernels/elementwise_unary.clembed"
     },
     {
-        "erode.cl",
-#include "./cl_kernels/erode.clembed"
-    },
-    {
-        "fast_corners.cl",
-#include "./cl_kernels/fast_corners.clembed"
-    },
-    {
         "fft.cl",
 #include "./cl_kernels/fft.clembed"
     },
@@ -816,10 +645,6 @@
 #include "./cl_kernels/gather.clembed"
     },
     {
-        "gaussian_pyramid.cl",
-#include "./cl_kernels/gaussian_pyramid.clembed"
-    },
-    {
         "gemm.cl",
 #include "./cl_kernels/gemm.clembed"
     },
@@ -844,10 +669,6 @@
 #include "./cl_kernels/generate_proposals_quantized.clembed"
     },
     {
-        "harris_corners.cl",
-#include "./cl_kernels/harris_corners.clembed"
-    },
-    {
         "helpers.h",
 #include "./cl_kernels/helpers.hembed"
     },
@@ -856,14 +677,6 @@
 #include "./cl_kernels/helpers_asymm.hembed"
     },
     {
-        "histogram.cl",
-#include "./cl_kernels/histogram.clembed"
-    },
-    {
-        "hog.cl",
-#include "./cl_kernels/hog.clembed"
-    },
-    {
         "im2col.cl",
 #include "./cl_kernels/im2col.clembed"
     },
@@ -872,22 +685,10 @@
 #include "./cl_kernels/instance_normalization.clembed"
     },
     {
-        "integral_image.cl",
-#include "./cl_kernels/integral_image.clembed"
-    },
-    {
         "l2_normalize.cl",
 #include "./cl_kernels/l2_normalize.clembed"
     },
     {
-        "magnitude_phase.cl",
-#include "./cl_kernels/magnitude_phase.clembed"
-    },
-    {
-        "mean_stddev.cl",
-#include "./cl_kernels/mean_stddev.clembed"
-    },
-    {
         "mean_stddev_normalization.cl",
 #include "./cl_kernels/mean_stddev_normalization.clembed"
     },
@@ -896,26 +697,10 @@
 #include "./cl_kernels/memset.clembed"
     },
     {
-        "minmaxloc.cl",
-#include "./cl_kernels/minmaxloc.clembed"
-    },
-    {
         "minmax_layer.cl",
 #include "./cl_kernels/minmax_layer.clembed"
     },
     {
-        "non_linear_filter3x3.cl",
-#include "./cl_kernels/non_linear_filter3x3.clembed"
-    },
-    {
-        "non_linear_filter5x5.cl",
-#include "./cl_kernels/non_linear_filter5x5.clembed"
-    },
-    {
-        "non_linear_filter_helpers.h",
-#include "./cl_kernels/non_linear_filter_helpers.hembed"
-    },
-    {
         "nonmax.cl",
 #include "./cl_kernels/nonmax.clembed"
     },
@@ -936,10 +721,6 @@
 #include "./cl_kernels/batchnormalization_layer.clembed"
     },
     {
-        "optical_flow_pyramid_lk.cl",
-#include "./cl_kernels/optical_flow_pyramid_lk.clembed"
-    },
-    {
         "pad_layer.cl",
 #include "./cl_kernels/pad_layer.clembed"
     },
@@ -1020,18 +801,10 @@
 #include "./cl_kernels/scale_quantized.clembed"
     },
     {
-        "scharr_filter.cl",
-#include "./cl_kernels/scharr_filter.clembed"
-    },
-    {
         "select.cl",
 #include "./cl_kernels/select.clembed"
     },
     {
-        "sobel_filter.cl",
-#include "./cl_kernels/sobel_filter.clembed"
-    },
-    {
         "softmax_layer.cl",
 #include "./cl_kernels/softmax_layer.clembed"
     },
@@ -1056,14 +829,6 @@
 #include "./cl_kernels/stack_layer.clembed"
     },
     {
-        "tablelookup.cl",
-#include "./cl_kernels/tablelookup.clembed"
-    },
-    {
-        "threshold.cl",
-#include "./cl_kernels/threshold.clembed"
-    },
-    {
         "tile.cl",
 #include "./cl_kernels/tile.clembed"
     },
@@ -1080,18 +845,6 @@
 #include "./cl_kernels/unpooling_layer.clembed"
     },
     {
-        "warp_affine.cl",
-#include "./cl_kernels/warp_affine.clembed"
-    },
-    {
-        "warp_helpers.h",
-#include "./cl_kernels/warp_helpers.hembed"
-    },
-    {
-        "warp_perspective.cl",
-#include "./cl_kernels/warp_perspective.clembed"
-    },
-    {
         "winograd_filter_transform.cl",
 #include "./cl_kernels/winograd_filter_transform.clembed"
     },
diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h
index 7383dce..22c9cd9 100644
--- a/src/core/CL/CLKernels.h
+++ b/src/core/CL/CLKernels.h
@@ -25,23 +25,15 @@
 #define ARM_COMPUTE_CLKERNELS_H
 
 /* Header regrouping all the CL kernels */
-#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-#include "src/core/CL/kernels/CLAccumulateKernel.h"
 #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
 #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 #include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
 #include "src/core/CL/kernels/CLBitwiseKernel.h"
 #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
-#include "src/core/CL/kernels/CLBox3x3Kernel.h"
-#include "src/core/CL/kernels/CLCannyEdgeKernel.h"
-#include "src/core/CL/kernels/CLChannelCombineKernel.h"
-#include "src/core/CL/kernels/CLChannelExtractKernel.h"
 #include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
 #include "src/core/CL/kernels/CLCol2ImKernel.h"
-#include "src/core/CL/kernels/CLColorConvertKernel.h"
 #include "src/core/CL/kernels/CLComparisonKernel.h"
 #include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
-#include "src/core/CL/kernels/CLConvolutionKernel.h"
 #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
 #include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
 #include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
@@ -51,14 +43,10 @@
 #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
 #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
 #include "src/core/CL/kernels/CLDequantizationLayerKernel.h"
-#include "src/core/CL/kernels/CLDerivativeKernel.h"
-#include "src/core/CL/kernels/CLDilateKernel.h"
 #include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
-#include "src/core/CL/kernels/CLErodeKernel.h"
 #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
 #include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
 #include "src/core/CL/kernels/CLFFTScaleKernel.h"
-#include "src/core/CL/kernels/CLFastCornersKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
@@ -77,28 +65,14 @@
 #include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
 #include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "src/core/CL/kernels/CLGatherKernel.h"
-#include "src/core/CL/kernels/CLGaussian3x3Kernel.h"
-#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
-#include "src/core/CL/kernels/CLGaussianPyramidKernel.h"
 #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
-#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
-#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
-#include "src/core/CL/kernels/CLHarrisCornersKernel.h"
-#include "src/core/CL/kernels/CLHistogramKernel.h"
 #include "src/core/CL/kernels/CLIm2ColKernel.h"
 #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
-#include "src/core/CL/kernels/CLIntegralImageKernel.h"
 #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
 #include "src/core/CL/kernels/CLLKTrackerKernel.h"
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
 #include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
-#include "src/core/CL/kernels/CLMeanStdDevKernel.h"
 #include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
-#include "src/core/CL/kernels/CLMedian3x3Kernel.h"
 #include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
-#include "src/core/CL/kernels/CLMinMaxLocationKernel.h"
-#include "src/core/CL/kernels/CLNonLinearFilterKernel.h"
-#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
 #include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
 #include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
 #include "src/core/CL/kernels/CLPadLayerKernel.h"
@@ -114,22 +88,14 @@
 #include "src/core/CL/kernels/CLReorgLayerKernel.h"
 #include "src/core/CL/kernels/CLReverseKernel.h"
 #include "src/core/CL/kernels/CLScaleKernel.h"
-#include "src/core/CL/kernels/CLScharr3x3Kernel.h"
 #include "src/core/CL/kernels/CLSelectKernel.h"
-#include "src/core/CL/kernels/CLSobel3x3Kernel.h"
-#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
-#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
 #include "src/core/CL/kernels/CLSoftmaxLayerKernel.h"
 #include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
 #include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
 #include "src/core/CL/kernels/CLStackLayerKernel.h"
 #include "src/core/CL/kernels/CLStridedSliceKernel.h"
-#include "src/core/CL/kernels/CLTableLookupKernel.h"
-#include "src/core/CL/kernels/CLThresholdKernel.h"
 #include "src/core/CL/kernels/CLTileKernel.h"
 #include "src/core/CL/kernels/CLTransposeKernel.h"
-#include "src/core/CL/kernels/CLWarpAffineKernel.h"
-#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h"
 #include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h"
 #include "src/core/CL/kernels/CLWinogradInputTransformKernel.h"
diff --git a/src/core/CL/cl_kernels/absdiff.cl b/src/core/CL/cl_kernels/absdiff.cl
deleted file mode 100644
index a09caf5..0000000
--- a/src/core/CL/cl_kernels/absdiff.cl
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Calculate the absolute difference of two input images.
- *
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:\n
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in]  in1_ptr                           Pointer to the first source image. Supported data types: U8, S16
- * @param[in]  in1_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  in1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  in1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  in2_ptr                           Pointer to the second source image. Supported data types: U8, S16
- * @param[in]  in2_stride_x                      Stride of the second source image in X dimension (in bytes)
- * @param[in]  in2_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the second source image in Y dimension (in bytes)
- * @param[in]  in2_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void absdiff(
-    IMAGE_DECLARATION(in1),
-    IMAGE_DECLARATION(in2),
-    IMAGE_DECLARATION(out))
-{
-    Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
-    Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-
-    vstore16(CONVERT_SAT(abs_diff(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/src/core/CL/cl_kernels/accumulate.cl b/src/core/CL/cl_kernels/accumulate.cl
deleted file mode 100644
index 9e37830..0000000
--- a/src/core/CL/cl_kernels/accumulate.cl
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function accumulates an input image into output image.
- *
- * @param[in]  input_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] accu_ptr                            Pointer to the destination image. Supported data types: S16
- * @param[in]  accu_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  accu_step_x                         accu_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  accu_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  accu_step_y                         accu_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  accu_offset_first_element_in_bytes  The offset of the first element in the destination image
- */
-__kernel void accumulate(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(accu))
-{
-    // Get pixels pointer
-    Image input = CONVERT_TO_IMAGE_STRUCT(input);
-    Image accu  = CONVERT_TO_IMAGE_STRUCT(accu);
-
-    // Load data
-    uchar16 in_data   = vload16(0, input.ptr);
-    short16 accu_data = vload16(0, (__global short *)accu.ptr);
-
-    // Perform accumulation
-    short16 res = add_sat(convert_short16(in_data), accu_data);
-
-    // Store result
-    vstore16(res, 0, (__global short *)accu.ptr);
-}
-
-/** This function accumulates a weighted value from an input image to an output image.
- *
- * @param[in]  input_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] accu_ptr                            Pointer to the destination image. Supported data types: S16
- * @param[in]  accu_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  accu_step_x                         accu_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  accu_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  accu_step_y                         accu_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  accu_offset_first_element_in_bytes  The offset of the first element in the destination image
- * @param[in]  alpha                               The float scalar value with a value in the range of 0 to 1
- */
-__kernel void accumulate_weighted(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(accu),
-    const float alpha)
-{
-    // Get pixels pointer
-    Image input = CONVERT_TO_IMAGE_STRUCT(input);
-    Image accu  = CONVERT_TO_IMAGE_STRUCT(accu);
-
-    // Load data
-    const float16 in_data   = convert_float16(vload16(0, input.ptr));
-    const float16 accu_data = convert_float16(vload16(0, accu.ptr));
-
-    // Calculate weighted accumulation
-    const uchar16 res = convert_uchar16((1.0f - alpha) * accu_data + alpha * in_data);
-
-    // Store result
-    vstore16(res, 0, accu.ptr);
-}
-
-/** This function accumulates a squared value from an input image to an output image.
- *
- * @param[in]  input_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] accu_ptr                            Pointer to the destination image. Supported data types: S16
- * @param[in]  accu_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  accu_step_x                         accu_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  accu_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  accu_step_y                         accu_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  accu_offset_first_element_in_bytes  The offset of the first element in the destination image
- * @param[in]  shift                               The U32 scalar value with a value in the range of 0 to 15
- */
-__kernel void accumulate_squared(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(accu),
-    const uint shift)
-{
-    // Get pixels pointer
-    Image input = CONVERT_TO_IMAGE_STRUCT(input);
-    Image accu  = CONVERT_TO_IMAGE_STRUCT(accu);
-
-    // Load data
-    ushort16 in_data   = convert_ushort16(vload16(0, input.ptr));
-    uint16   accu_data = convert_uint16(vload16(0, (__global short *)accu.ptr));
-
-    // Calculate squared accumulation
-    short16 res = convert_short16_sat(accu_data + convert_uint16((in_data * in_data) >> shift));
-
-    // Store result
-    vstore16(res, 0, (__global short *)accu.ptr);
-}
diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl
deleted file mode 100644
index bcff843..0000000
--- a/src/core/CL/cl_kernels/canny.cl
+++ /dev/null
@@ -1,454 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Calculate the magnitude and phase from horizontal and vertical result of sobel result.
- *
- * @note The calculation of gradient uses level 1 normalisation.
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in]  src1_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
- * @param[in]  src1_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src1_step_x                         src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src1_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src1_step_y                         src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src1_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  src2_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
- * @param[in]  src2_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src2_step_x                         src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src2_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src2_step_y                         src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src2_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] grad_ptr                            Pointer to the gradient output. Supported data types: U16, U32
- * @param[in]  grad_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  grad_step_x                         grad_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  grad_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  grad_step_y                         grad_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  grad_offset_first_element_in_bytes  The offset of the first element of the output
- * @param[out] angle_ptr                           Pointer to the angle output. Supported data types: U8
- * @param[in]  angle_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  angle_step_x                        angle_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  angle_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  angle_step_y                        angle_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  angle_offset_first_element_in_bytes The offset of the first element of the output
- */
-__kernel void combine_gradients_L1(
-    IMAGE_DECLARATION(src1),
-    IMAGE_DECLARATION(src2),
-    IMAGE_DECLARATION(grad),
-    IMAGE_DECLARATION(angle))
-{
-    // Construct images
-    Image src1  = CONVERT_TO_IMAGE_STRUCT(src1);
-    Image src2  = CONVERT_TO_IMAGE_STRUCT(src2);
-    Image grad  = CONVERT_TO_IMAGE_STRUCT(grad);
-    Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
-
-    // Load sobel horizontal and vertical values
-    VEC_DATA_TYPE(DATA_TYPE_IN, 4)
-    h = vload4(0, (__global DATA_TYPE_IN *)src1.ptr);
-    VEC_DATA_TYPE(DATA_TYPE_IN, 4)
-    v = vload4(0, (__global DATA_TYPE_IN *)src2.ptr);
-
-    /* Calculate the gradient, using level 1 normalisation method */
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 4)
-    m = CONVERT_SAT((abs(h) + abs(v)), VEC_DATA_TYPE(DATA_TYPE_OUT, 4));
-
-    /* Calculate the angle */
-    float4 p = 180.0f * atan2pi(convert_float4(v), convert_float4(h));
-
-    /* Remap angle to range [0, 256) */
-    p = select(p, p + 180.0f, p < 0.0f);
-
-    /* Store results */
-    vstore4(m, 0, (__global DATA_TYPE_OUT *)grad.ptr);
-    vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
-}
-
-/** Calculate the gradient and angle from horizontal and vertical result of sobel result.
- *
- * @note The calculation of gradient uses level 2 normalisation
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in]  src1_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
- * @param[in]  src1_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src1_step_x                         src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src1_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src1_step_y                         src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src1_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  src2_ptr                            Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
- * @param[in]  src2_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src2_step_x                         src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src2_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src2_step_y                         src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src2_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] grad_ptr                            Pointer to the gradient output. Supported data types: U16, U32
- * @param[in]  grad_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  grad_step_x                         grad_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  grad_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  grad_step_y                         grad_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  grad_offset_first_element_in_bytes  The offset of the first element of the output
- * @param[out] angle_ptr                           Pointer to the angle output. Supported data types: U8
- * @param[in]  angle_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  angle_step_x                        angle_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  angle_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  angle_step_y                        angle_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  angle_offset_first_element_in_bytes The offset of the first element of the output
- */
-__kernel void combine_gradients_L2(
-    IMAGE_DECLARATION(src1),
-    IMAGE_DECLARATION(src2),
-    IMAGE_DECLARATION(grad),
-    IMAGE_DECLARATION(angle))
-{
-    // Construct images
-    Image src1  = CONVERT_TO_IMAGE_STRUCT(src1);
-    Image src2  = CONVERT_TO_IMAGE_STRUCT(src2);
-    Image grad  = CONVERT_TO_IMAGE_STRUCT(grad);
-    Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
-
-    // Load sobel horizontal and vertical values
-    float4 h = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src1.ptr));
-    float4 v = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src2.ptr));
-
-    /* Calculate the gradient, using level 2 normalisation method */
-    float4 m = sqrt(h * h + v * v);
-
-    /* Calculate the angle */
-    float4 p = 180.0f * atan2pi(v, h);
-
-    /* Remap angle to range [0, 256) */
-    p = select(p, p + 180.0f, p < 0.0f);
-
-    /* Store results */
-    vstore4(CONVERT_SAT_ROUND(m, VEC_DATA_TYPE(DATA_TYPE_OUT, 4), rte), 0, (__global DATA_TYPE_OUT *)grad.ptr);
-    vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
-}
-
-#define EDGE 255
-#define NO_EDGE 0
-
-/** Array that holds the relative coordinates offset for the neighbouring pixels.
- */
-__constant short4 neighbours_coords[] =
-{
-    { -1, 0, 1, 0 },  // 0
-    { -1, -1, 1, 1 }, // 45
-    { 0, -1, 0, 1 },  // 90
-    { 1, -1, -1, 1 }, // 135
-};
-
-/** Perform non maximum suppression.
- *
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in]  grad_ptr                              Pointer to the gradient output. Supported data types: S16, S32
- * @param[in]  grad_stride_x                         Stride of the source image in X dimension (in bytes)
- * @param[in]  grad_step_x                           grad_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  grad_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  grad_step_y                           grad_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  grad_offset_first_element_in_bytes    The offset of the first element of the output
- * @param[in]  angle_ptr                             Pointer to the angle output. Supported data types: U8
- * @param[in]  angle_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  angle_step_x                          angle_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  angle_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  angle_step_y                          angle_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  angle_offset_first_element_in_bytes   TThe offset of the first element of the output
- * @param[out] non_max_ptr                           Pointer to the non maximum suppressed output. Supported data types: U16, U32
- * @param[in]  non_max_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  non_max_step_x                        non_max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  non_max_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  non_max_step_y                        non_max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  non_max_offset_first_element_in_bytes The offset of the first element of the output
- * @param[in]  lower_thr                             The low threshold
- */
-__kernel void suppress_non_maximum(
-    IMAGE_DECLARATION(grad),
-    IMAGE_DECLARATION(angle),
-    IMAGE_DECLARATION(non_max),
-    uint lower_thr)
-{
-    // Construct images
-    Image grad    = CONVERT_TO_IMAGE_STRUCT(grad);
-    Image angle   = CONVERT_TO_IMAGE_STRUCT(angle);
-    Image non_max = CONVERT_TO_IMAGE_STRUCT(non_max);
-
-    // Index
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    // Get gradient and angle
-    DATA_TYPE_IN gradient = *((__global DATA_TYPE_IN *)grad.ptr);
-    uchar an              = *((__global uchar *)angle.ptr);
-
-    // Early return if not greater than lower threshold
-    if(gradient <= lower_thr)
-    {
-        return;
-    }
-
-    // Divide the whole round into 4 directions
-    DATA_TYPE_OUT q_an;
-
-    if(an < 22.5f || an >= 157.5f)
-    {
-        q_an = 0;
-    }
-    else if(an < 67.5f)
-    {
-        q_an = 1;
-    }
-    else if(an < 112.5f)
-    {
-        q_an = 2;
-    }
-    else
-    {
-        q_an = 3;
-    }
-
-    // Find the two pixels in the perpendicular direction
-    short2       x_p = neighbours_coords[q_an].s02;
-    short2       y_p = neighbours_coords[q_an].s13;
-    DATA_TYPE_IN g1  = *((global DATA_TYPE_IN *)offset(&grad, x_p.x, y_p.x));
-    DATA_TYPE_IN g2  = *((global DATA_TYPE_IN *)offset(&grad, x_p.y, y_p.y));
-
-    if((gradient > g1) && (gradient > g2))
-    {
-        __global uchar *non_max_addr            = non_max_ptr + non_max_offset_first_element_in_bytes + x * non_max_stride_x + y * non_max_stride_y;
-        *((global DATA_TYPE_OUT *)non_max_addr) = gradient;
-    }
-}
-
-#define hysteresis_local_stack_L1 8  // The size of level 1 stack. This has to agree with the host side
-#define hysteresis_local_stack_L2 16 // The size of level 2 stack, adjust this can impact the match rate with VX implementation
-
-/** Check whether pixel is valid
- *
- * Skip the pixel if the early_test fails.
- * Otherwise, it tries to add the pixel coordinate to the stack, and proceed to popping the stack instead if the stack is full
- *
- * @param[in] early_test Boolean condition based on the minv check and visited buffer check
- * @param[in] x_pos      X-coordinate of pixel that is going to be recorded, has to be within the boundary
- * @param[in] y_pos      Y-coordinate of pixel that is going to be recorded, has to be within the boundary
- * @param[in] x_cur      X-coordinate of current central pixel
- * @param[in] y_cur      Y-coordinate of current central pixel
- */
-#define check_pixel(early_test, x_pos, y_pos, x_cur, y_cur)                               \
-    {                                                                                     \
-        if(!early_test)                                                                   \
-        {                                                                                 \
-            /* Number of elements in the local stack 1, points to next available entry */ \
-            c = *((__global char *)offset(&l1_stack_counter, x_cur, y_cur));              \
-            \
-            if(c > (hysteresis_local_stack_L1 - 1)) /* Stack level 1 is full */           \
-                goto pop_stack;                                                           \
-            \
-            /* The pixel that has already been recorded is ignored */                     \
-            if(!atomic_or((__global uint *)offset(&recorded, x_pos, y_pos), 1))           \
-            {                                                                             \
-                l1_ptr[c] = (short2)(x_pos, y_pos);                                       \
-                *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)) += 1;         \
-            }                                                                             \
-        }                                                                                 \
-    }
-
-/** Perform hysteresis.
- *
- * @attention The input data_type needs to be passed at compile time using -DDATA_TYPE_IN: e.g. -DDATA_TYPE_IN=short
- *
- * @param[in]  src_ptr                                        Pointer to the input image. Supported data types: U8
- * @param[in]  src_stride_x                                   Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                                     src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                                   Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                                     src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes              The offset of the first element of the output
- * @param[out] out_ptr                                        Pointer to the output image. Supported data types: U8
- * @param[in]  out_stride_x                                   Stride of the source image in X dimension (in bytes)
- * @param[in]  out_step_x                                     out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                                   Stride of the source image in Y dimension (in bytes)
- * @param[in]  out_step_y                                     out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes              The offset of the first element of the output
- * @param[out] visited_ptr                                    Pointer to the visited buffer, where pixels are marked as visited. Supported data types: U32
- * @param[in]  visited_stride_x                               Stride of the source image in X dimension (in bytes)
- * @param[in]  visited_step_x                                 visited_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  visited_stride_y                               Stride of the source image in Y dimension (in bytes)
- * @param[in]  visited_step_y                                 visited_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  visited_offset_first_element_in_bytes          The offset of the first element of the output
- * @param[out] recorded_ptr                                   Pointer to the recorded buffer, where pixels are marked as recorded. Supported data types: U32
- * @param[in]  recorded_stride_x                              Stride of the source image in X dimension (in bytes)
- * @param[in]  recorded_step_x                                recorded_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  recorded_stride_y                              Stride of the source image in Y dimension (in bytes)
- * @param[in]  recorded_step_y                                recorded_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  recorded_offset_first_element_in_bytes         The offset of the first element of the output
- * @param[out] l1_stack_ptr                                   Pointer to the l1 stack of a pixel. Supported data types: S32
- * @param[in]  l1_stack_stride_x                              Stride of the source image in X dimension (in bytes)
- * @param[in]  l1_stack_step_x                                l1_stack_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  l1_stack_stride_y                              Stride of the source image in Y dimension (in bytes)
- * @param[in]  l1_stack_step_y                                l1_stack_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  l1_stack_offset_first_element_in_bytes         The offset of the first element of the output
- * @param[out] l1_stack_counter_ptr                           Pointer to the l1 stack counters of an image. Supported data types: U8
- * @param[in]  l1_stack_counter_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  l1_stack_counter_step_x                        l1_stack_counter_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  l1_stack_counter_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  l1_stack_counter_step_y                        l1_stack_counter_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  l1_stack_counter_offset_first_element_in_bytes The offset of the first element of the output
- * @param[in]  low_thr                                        The lower threshold
- * @param[in]  up_thr                                         The upper threshold
- * @param[in]  width                                          The width of the image.
- * @param[in]  height                                         The height of the image
- */
-kernel void hysteresis(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(out),
-    IMAGE_DECLARATION(visited),
-    IMAGE_DECLARATION(recorded),
-    IMAGE_DECLARATION(l1_stack),
-    IMAGE_DECLARATION(l1_stack_counter),
-    uint low_thr,
-    uint up_thr,
-    int  width,
-    int  height)
-{
-    // Create images
-    Image src              = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
-    Image out              = CONVERT_TO_IMAGE_STRUCT_NO_STEP(out);
-    Image visited          = CONVERT_TO_IMAGE_STRUCT_NO_STEP(visited);
-    Image recorded         = CONVERT_TO_IMAGE_STRUCT_NO_STEP(recorded);
-    Image l1_stack         = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack);
-    Image l1_stack_counter = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack_counter);
-
-    // Index
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    // Load value
-    DATA_TYPE_IN val = *((__global DATA_TYPE_IN *)offset(&src, x, y));
-
-    // If the pixel has already been marked as NO_EDGE, store that value in the output and return
-    if(val == NO_EDGE)
-    {
-        *offset(&out, x, y) = NO_EDGE;
-        return;
-    }
-
-    // Return if it is a MAYBE pixel. Such pixels will become edges if near a strong edge
-    if(val <= up_thr)
-    {
-        return;
-    }
-
-    // Init local stack 2
-    short2 stack_L2[hysteresis_local_stack_L2] = { 0 };
-    int    L2_counter                          = 0;
-
-    // Perform recursive hysteresis
-    while(true)
-    {
-        // Get L1 stack pointer
-        __global short2 *l1_ptr = (__global short2 *)(l1_stack.ptr + y * l1_stack.stride_y + x * hysteresis_local_stack_L1 * l1_stack.stride_x);
-
-        // If the pixel has already been visited, proceed with the items in the stack instead
-        if(atomic_or((__global uint *)offset(&visited, x, y), 1) != 0)
-        {
-            goto pop_stack;
-        }
-
-        // Set strong edge
-        *offset(&out, x, y) = EDGE;
-
-        // If it is the top of stack l2, we don't need check the surrounding pixels
-        if(L2_counter > (hysteresis_local_stack_L2 - 1))
-        {
-            goto pop_stack2;
-        }
-
-        // Points to the start of the local stack;
-        char c;
-
-        VEC_DATA_TYPE(DATA_TYPE_IN, 4)
-        x_tmp;
-        uint4 v_tmp;
-
-        // Get direction pixel indices
-        int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2);
-
-        // Check 8 pixels around for weak edges where low_thr < val <= up_thr
-        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N));
-        v_tmp = vload4(0, (__global uint *)offset(&visited, W, N));
-        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW
-        check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, N, x, y); // N
-        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, N, x, y); // NE
-
-        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, y));
-        v_tmp = vload4(0, (__global uint *)offset(&visited, W, y));
-        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, y, x, y); // W
-        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, y, x, y); // E
-
-        x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, S));
-        v_tmp = vload4(0, (__global uint *)offset(&visited, W, S));
-        check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, S, x, y); // SW
-        check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, S, x, y); // S
-        check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, S, x, y); // SE
-
-#undef check_pixel
-
-pop_stack:
-        c = *((__global char *)offset(&l1_stack_counter, x, y));
-
-        if(c >= 1)
-        {
-            *((__global char *)offset(&l1_stack_counter, x, y)) -= 1;
-            int2 l_c = convert_int2(l1_ptr[c - 1]);
-
-            // Push the current position into level 2 stack
-            stack_L2[L2_counter].x = x;
-            stack_L2[L2_counter].y = y;
-
-            x = l_c.x;
-            y = l_c.y;
-
-            L2_counter++;
-
-            continue;
-        }
-
-        if(L2_counter > 0)
-        {
-            goto pop_stack2;
-        }
-        else
-        {
-            return;
-        }
-
-pop_stack2:
-        L2_counter--;
-        x = stack_L2[L2_counter].x;
-        y = stack_L2[L2_counter].y;
-    };
-}
diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl
deleted file mode 100644
index 550d52e..0000000
--- a/src/core/CL/cl_kernels/channel_combine.cl
+++ /dev/null
@@ -1,416 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function combines three planes to a single RGB image.
- *
- * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: RGB
- * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
- */
-__kernel void channel_combine_RGB888(
-    IMAGE_DECLARATION(plane0),
-    IMAGE_DECLARATION(plane1),
-    IMAGE_DECLARATION(plane2),
-    IMAGE_DECLARATION(dst))
-{
-    // Get pixels pointer
-    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
-    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
-    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
-    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data0 = vload16(0, plane0.ptr);
-    uchar16 data1 = vload16(0, plane1.ptr);
-    uchar16 data2 = vload16(0, plane2.ptr);
-
-    uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0,
-                             data0.s1, data1.s1, data2.s1,
-                             data0.s2, data1.s2, data2.s2,
-                             data0.s3, data1.s3, data2.s3,
-                             data0.s4, data1.s4, data2.s4,
-                             data0.s5);
-    vstore16(out0, 0, dst.ptr);
-
-    uchar16 out1 = (uchar16)(data1.s5, data2.s5, data0.s6,
-                             data1.s6, data2.s6, data0.s7,
-                             data1.s7, data2.s7, data0.s8,
-                             data1.s8, data2.s8, data0.s9,
-                             data1.s9, data2.s9, data0.sA,
-                             data1.sA);
-    vstore16(out1, 0, dst.ptr + 16);
-
-    uchar16 out2 = (uchar16)(data2.sA, data0.sB, data1.sB,
-                             data2.sB, data0.sC, data1.sC,
-                             data2.sC, data0.sD, data1.sD,
-                             data2.sD, data0.sE, data1.sE,
-                             data2.sE, data0.sF, data1.sF,
-                             data2.sF);
-    vstore16(out2, 0, dst.ptr + 32);
-}
-
-/** This function combines three planes to a single RGBA image.
- *
- * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] plane3_ptr                           Pointer to the fourth plane. Supported Format: U8
- * @param[in] plane3_stride_x                      Stride of the fourth plane in X dimension (in bytes)
- * @param[in] plane3_step_x                        plane3_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane3_stride_y                      Stride of the fourth plane in Y dimension (in bytes)
- * @param[in] plane3_step_y                        plane3_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane3_offset_first_element_in_bytes The offset of the first element in the fourth plane
- * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: RGBA
- * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
- */
-__kernel void channel_combine_RGBA8888(
-    IMAGE_DECLARATION(plane0),
-    IMAGE_DECLARATION(plane1),
-    IMAGE_DECLARATION(plane2),
-    IMAGE_DECLARATION(plane3),
-    IMAGE_DECLARATION(dst))
-{
-    // Get pixels pointer
-    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
-    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
-    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
-    Image plane3 = CONVERT_TO_IMAGE_STRUCT(plane3);
-    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data0 = vload16(0, plane0.ptr);
-    uchar16 data1 = vload16(0, plane1.ptr);
-    uchar16 data2 = vload16(0, plane2.ptr);
-    uchar16 data3 = vload16(0, plane3.ptr);
-
-    uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0, data3.s0,
-                             data0.s1, data1.s1, data2.s1, data3.s1,
-                             data0.s2, data1.s2, data2.s2, data3.s2,
-                             data0.s3, data1.s3, data2.s3, data3.s3);
-    vstore16(out0, 0, dst.ptr);
-
-    uchar16 out1 = (uchar16)(data0.s4, data1.s4, data2.s4, data3.s4,
-                             data0.s5, data1.s5, data2.s5, data3.s5,
-                             data0.s6, data1.s6, data2.s6, data3.s6,
-                             data0.s7, data1.s7, data2.s7, data3.s7);
-    vstore16(out1, 0, dst.ptr + 16);
-
-    uchar16 out2 = (uchar16)(data0.s8, data1.s8, data2.s8, data3.s8,
-                             data0.s9, data1.s9, data2.s9, data3.s9,
-                             data0.sA, data1.sA, data2.sA, data3.sA,
-                             data0.sB, data1.sB, data2.sB, data3.sB);
-    vstore16(out2, 0, dst.ptr + 32);
-
-    uchar16 out3 = (uchar16)(data0.sC, data1.sC, data2.sC, data3.sC,
-                             data0.sD, data1.sD, data2.sD, data3.sD,
-                             data0.sE, data1.sE, data2.sE, data3.sE,
-                             data0.sF, data1.sF, data2.sF, data3.sF);
-    vstore16(out3, 0, dst.ptr + 48);
-}
-
-/** This function combines three planes to a single YUYV image.
- *
- * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: YUYV
- * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
- */
-__kernel void channel_combine_YUYV422(
-    IMAGE_DECLARATION(plane0),
-    IMAGE_DECLARATION(plane1),
-    IMAGE_DECLARATION(plane2),
-    IMAGE_DECLARATION(dst))
-{
-    // Get pixels pointer
-    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
-    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
-    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
-    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data0 = vload16(0, plane0.ptr);
-    uchar8  data1 = vload8(0, plane1.ptr);
-    uchar8  data2 = vload8(0, plane2.ptr);
-
-    uchar16 out0 = (uchar16)(data0.s0, data1.s0, data0.s1, data2.s0,
-                             data0.s2, data1.s1, data0.s3, data2.s1,
-                             data0.s4, data1.s2, data0.s5, data2.s2,
-                             data0.s6, data1.s3, data0.s7, data2.s3);
-    vstore16(out0, 0, dst.ptr);
-    uchar16 out1 = (uchar16)(data0.s8, data1.s4, data0.s9, data2.s4,
-                             data0.sA, data1.s5, data0.sB, data2.s5,
-                             data0.sC, data1.s6, data0.sD, data2.s6,
-                             data0.sE, data1.s7, data0.sF, data2.s7);
-    vstore16(out1, 0, dst.ptr + 16);
-}
-
-/** This function combines three planes to a single UYUV image.
- *
- * @param[in] plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x                        plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y                        plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x                        plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y                        plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x                        plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y                        plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] dst_ptr                              Pointer to the destination image. Supported Format: UYUV
- * @param[in] dst_stride_x                         Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination image
- */
-__kernel void channel_combine_UYVY422(
-    IMAGE_DECLARATION(plane0),
-    IMAGE_DECLARATION(plane1),
-    IMAGE_DECLARATION(plane2),
-    IMAGE_DECLARATION(dst))
-{
-    // Get pixels pointer
-    Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
-    Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
-    Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
-    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data0 = vload16(0, plane0.ptr);
-    uchar8  data1 = vload8(0, plane1.ptr);
-    uchar8  data2 = vload8(0, plane2.ptr);
-
-    uchar16 out0 = (uchar16)(data1.s0, data0.s0, data2.s0, data0.s1,
-                             data1.s1, data0.s2, data2.s1, data0.s3,
-                             data1.s2, data0.s4, data2.s2, data0.s5,
-                             data1.s3, data0.s6, data2.s3, data0.s7);
-    vstore16(out0, 0, dst.ptr);
-    uchar16 out1 = (uchar16)(data1.s4, data0.s8, data2.s4, data0.s9,
-                             data1.s5, data0.sA, data2.s5, data0.sB,
-                             data1.s6, data0.sC, data2.s6, data0.sD,
-                             data1.s7, data0.sE, data2.s7, data0.sF);
-    vstore16(out1, 0, dst.ptr + 16);
-}
-
-/** This function combines three planes to a single NV12/NV21 image.
- *
- * @note NV12 or NV21 has to be specified through preprocessor macro. eg. -DNV12 performs NV12 channel combine.
- *
- * @param[in] src_plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] src_plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] src_plane0_step_x                        src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] src_plane0_step_y                        src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] src_plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] src_plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] src_plane1_step_x                        src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] src_plane1_step_y                        src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] src_plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] src_plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] src_plane2_step_x                        src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] src_plane2_step_y                        src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] dst_plane0_ptr                           Pointer to the first plane of the destination image. Supported Format: U8
- * @param[in] dst_plane0_stride_x                      Stride of the first plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane0_step_x                        dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane0_stride_y                      Stride of the first plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane0_step_y                        dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
- * @param[in] dst_plane1_ptr                           Pointer to the second plane of the destination image. Supported Format: UV88
- * @param[in] dst_plane1_stride_x                      Stride of the second plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane1_step_x                        dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane1_stride_y                      Stride of the second plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane1_step_y                        dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
- * @param[in] height                                   Sub-sampled height
- */
-__kernel void channel_combine_NV(
-    IMAGE_DECLARATION(src_plane0),
-    IMAGE_DECLARATION(src_plane1),
-    IMAGE_DECLARATION(src_plane2),
-    IMAGE_DECLARATION(dst_plane0),
-    IMAGE_DECLARATION(dst_plane1),
-    uint height)
-{
-    // Get pixels pointer
-    Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
-    Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
-    Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
-    Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
-    Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
-
-    // Copy plane data
-    vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
-    vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
-
-    // Create UV place
-    uchar8 data1 = vload8(0, src_plane1.ptr);
-    uchar8 data2 = vload8(0, src_plane2.ptr);
-
-#ifdef NV12
-    vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
-#elif defined(NV21)
-    vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
-#endif /* NV12 or NV21 */
-}
-
-/** This function combines three planes to a single YUV444 or IYUV image.
- *
- * @note YUV444 or IYUV has to be specified through preprocessor macro. eg. -DIYUV performs IYUV channel combine.
- *
- * @param[in] src_plane0_ptr                           Pointer to the first plane. Supported Format: U8
- * @param[in] src_plane0_stride_x                      Stride of the first plane in X dimension (in bytes)
- * @param[in] src_plane0_step_x                        src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane0_stride_y                      Stride of the first plane in Y dimension (in bytes)
- * @param[in] src_plane0_step_y                        src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] src_plane1_ptr                           Pointer to the second plane. Supported Format: U8
- * @param[in] src_plane1_stride_x                      Stride of the second plane in X dimension (in bytes)
- * @param[in] src_plane1_step_x                        src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane1_stride_y                      Stride of the second plane in Y dimension (in bytes)
- * @param[in] src_plane1_step_y                        src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] src_plane2_ptr                           Pointer to the third plane. Supported Format: U8
- * @param[in] src_plane2_stride_x                      Stride of the third plane in X dimension (in bytes)
- * @param[in] src_plane2_step_x                        src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane2_stride_y                      Stride of the third plane in Y dimension (in bytes)
- * @param[in] src_plane2_step_y                        src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] dst_plane0_ptr                           Pointer to the first plane of the destination image. Supported Format: U8
- * @param[in] dst_plane0_stride_x                      Stride of the first plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane0_step_x                        dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane0_stride_y                      Stride of the first plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane0_step_y                        dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
- * @param[in] dst_plane1_ptr                           Pointer to the second plane of the destination image. Supported Format: U8
- * @param[in] dst_plane1_stride_x                      Stride of the second plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane1_step_x                        dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane1_stride_y                      Stride of the second plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane1_step_y                        dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
- * @param[in] dst_plane2_ptr                           Pointer to the third plane of the destination image. Supported Format: U8
- * @param[in] dst_plane2_stride_x                      Stride of the third plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane2_step_x                        dst_plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane2_stride_y                      Stride of the third plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane2_step_y                        dst_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane2_offset_first_element_in_bytes The offset of the first element in the third plane of the destination image
- * @param[in] height                                   Sub-sampled height
- */
-__kernel void copy_planes_3p(
-    IMAGE_DECLARATION(src_plane0),
-    IMAGE_DECLARATION(src_plane1),
-    IMAGE_DECLARATION(src_plane2),
-    IMAGE_DECLARATION(dst_plane0),
-    IMAGE_DECLARATION(dst_plane1),
-    IMAGE_DECLARATION(dst_plane2),
-    uint height)
-{
-    // Get pixels pointer
-    Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
-    Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
-    Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
-    Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
-    Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
-    Image dst_plane2 = CONVERT_TO_IMAGE_STRUCT(dst_plane2);
-
-    // Copy plane data
-    vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
-#ifdef YUV444
-    vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr);
-    vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr);
-#elif defined(IYUV)
-    vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
-    vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr);
-    vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr);
-#endif /* YUV444 or IYUV */
-}
diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl
deleted file mode 100644
index b64f248..0000000
--- a/src/core/CL/cl_kernels/channel_extract.cl
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function extracts a given channel from an RGB image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: RGB
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_RGB888(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for source and destination; all loads/stores below go through their .ptr
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data  = vload16(0, src.ptr);     // source bytes 0..15
-    uchar8  data2 = vload8(0, src.ptr + 16); // source bytes 16..23 (24 bytes total = 8 three-byte pixels)
-
-#ifdef CHANNEL_R
-    vstore4(data.s0369, 0, dst.ptr);                        // source bytes 0,3,6,9
-    vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4); // source bytes 12,15,18,21 (data2 starts at byte 16)
-#elif defined(CHANNEL_G)
-    vstore4(data.s147A, 0, dst.ptr);                        // source bytes 1,4,7,10
-    vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4); // source bytes 13,16,19,22
-#elif defined(CHANNEL_B)
-    vstore4(data.s258B, 0, dst.ptr);                        // source bytes 2,5,8,11
-    vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4); // source bytes 14,17,20,23
-#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B */
-}
-
-/** This function extracts a given channel from an RGBA image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: RGBA
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_RGBA8888(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for source and destination; all loads/stores below go through their .ptr
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data  = vload16(0, src.ptr);      // source bytes 0..15
-    uchar16 data2 = vload16(0, src.ptr + 16); // source bytes 16..31 (32 bytes total = 8 four-byte pixels)
-
-#ifdef CHANNEL_R
-    vstore8((uchar8)(data.s048C, data2.s048C), 0, dst.ptr); // every 4th byte starting at byte 0
-#elif defined(CHANNEL_G)
-    vstore8((uchar8)(data.s159D, data2.s159D), 0, dst.ptr); // every 4th byte starting at byte 1
-#elif defined(CHANNEL_B)
-    vstore8((uchar8)(data.s26AE, data2.s26AE), 0, dst.ptr); // every 4th byte starting at byte 2
-#elif defined(CHANNEL_A)
-    vstore8((uchar8)(data.s37BF, data2.s37BF), 0, dst.ptr); // every 4th byte starting at byte 3
-#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B or CHANNEL_A */
-}
-
-/** This function extracts a given channel from a YUYV image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: YUYV
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_YUYV422(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for source and destination; all loads/stores below go through their .ptr
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data = vload16(0, src.ptr); // 16 source bytes; the swizzles below pick one channel out of them
-
-#ifdef CHANNEL_Y
-    vstore8(data.s02468ACE, 0, dst.ptr); // even bytes -> 8 output values
-#elif defined(CHANNEL_U)
-    vstore4(data.s159D, 0, dst.ptr);     // bytes 1,5,9,13 -> only 4 output values
-#elif defined(CHANNEL_V)
-    vstore4(data.s37BF, 0, dst.ptr);     // bytes 3,7,11,15 -> only 4 output values
-#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
-}
-
-/** This function extracts a given channel from a UYVY image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: UYVY
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_UYVY422(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for source and destination; all loads/stores below go through their .ptr
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data = vload16(0, src.ptr); // 16 source bytes; the swizzles below pick one channel out of them
-
-#ifdef CHANNEL_Y
-    vstore8(data.s13579BDF, 0, dst.ptr); // odd bytes -> 8 output values
-#elif defined(CHANNEL_U)
-    vstore4(data.s048C, 0, dst.ptr);     // bytes 0,4,8,12 -> only 4 output values
-#elif defined(CHANNEL_V)
-    vstore4(data.s26AE, 0, dst.ptr);     // bytes 2,6,10,14 -> only 4 output values
-#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
-}
-
-/** This function extracts a given channel from an NV12 image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- * @warning Only channels UV can be extracted using this kernel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: NV12 (UV88)
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_NV12(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for source and destination; all loads/stores below go through their .ptr
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data = vload16(0, src.ptr); // 16 bytes of the interleaved two-channel source plane
-
-#ifdef CHANNEL_U
-    vstore8(data.s02468ACE, 0, dst.ptr); // U is taken from the even bytes
-#elif defined(CHANNEL_V)
-    vstore8(data.s13579BDF, 0, dst.ptr); // V is taken from the odd bytes
-#endif /* CHANNEL_U or CHANNEL_V */
-}
-
-/** This function extracts a given channel from an NV21 image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- * @warning Only channels UV can be extracted using this kernel.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: NV21 (UV88)
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_NV21(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for source and destination; all loads/stores below go through their .ptr
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 data = vload16(0, src.ptr); // 16 bytes of the interleaved two-channel source plane
-
-#ifdef CHANNEL_U
-    vstore8(data.s13579BDF, 0, dst.ptr); // U is taken from the odd bytes (byte order is the reverse of the NV12 kernel)
-#elif defined(CHANNEL_V)
-    vstore8(data.s02468ACE, 0, dst.ptr); // V is taken from the even bytes
-#endif /* CHANNEL_U or CHANNEL_V */
-}
-
-/** This function extracts a given plane from a multi-planar image.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported Format: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void copy_plane(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    // Build Image structs for source and destination; the copy below goes through their .ptr
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Copy plane data: a single unmodified 8-byte load/store per invocation
-    vstore8(vload8(0, src.ptr), 0, dst.ptr);
-}
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
deleted file mode 100644
index cbebc88..0000000
--- a/src/core/CL/cl_kernels/color_convert.cl
+++ /dev/null
@@ -1,1911 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Convert an RGB888 image to RGBA8888 (the fourth byte of every pixel is set to 255)
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void RGB888_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 16 pixels every time: 48 input bytes (3 per pixel) become 64 output bytes (4 per pixel)
-    uchar16 rgb_0 = vload16(0, in.ptr);
-    uchar16 rgb_1 = vload16(0, in.ptr + 16);
-    uchar16 rgb_2 = vload16(0, in.ptr + 32);
-
-    uchar16 rgba_0 = (uchar16)(rgb_0.s012, 255, rgb_0.s345, 255, rgb_0.s678, 255, rgb_0.s9ab, 255);          // input bytes 0..11, constant 255 appended after each 3-byte triple
-    uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.sf, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255); // input bytes 12..23; one triple straddles rgb_0/rgb_1
-    uchar16 rgba_2 = (uchar16)(rgb_1.s89a, 255, rgb_1.sbcd, 255, rgb_1.sef, rgb_2.s0, 255, rgb_2.s123, 255); // input bytes 24..35; one triple straddles rgb_1/rgb_2
-    uchar16 rgba_3 = (uchar16)(rgb_2.s456, 255, rgb_2.s789, 255, rgb_2.sabc, 255, rgb_2.sdef, 255);          // input bytes 36..47
-
-    vstore16(rgba_0, 0, out.ptr);
-    vstore16(rgba_1, 0, out.ptr + 16);
-    vstore16(rgba_2, 0, out.ptr + 32);
-    vstore16(rgba_3, 0, out.ptr + 48);
-}
-
-/** Convert an RGB888 image to U8
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: RGB888
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void RGB888_to_U8_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 16 pixels every time: 48 input bytes read as three 16-byte loads, 16 output bytes
-    const uchar16 rgb_0 = vload16(0, in.ptr);
-    const uchar16 rgb_1 = vload16(0, in.ptr + 16);
-    const uchar16 rgb_2 = vload16(0, in.ptr + 32);
-
-    //Resequence values from a sequence of 16 RGB values to sequence of 16 R, 16 G, 16 B values
-    const uchar16 rgb_r = (uchar16)(rgb_0.s0369, rgb_0.scf, rgb_1.s258b, rgb_1.se, rgb_2.s147a, rgb_2.sd); // every 3rd input byte starting at byte 0
-    const uchar16 rgb_g = (uchar16)(rgb_0.s147a, rgb_0.sd, rgb_1.s0369, rgb_1.scf, rgb_2.s258b, rgb_2.se); // every 3rd input byte starting at byte 1
-    const uchar16 rgb_b = (uchar16)(rgb_0.s258b, rgb_0.se, rgb_1.s147a, rgb_1.sd, rgb_2.s0369, rgb_2.scf); // every 3rd input byte starting at byte 2
-
-    const float16 rgb2u8_red_coef_bt709   = 0.2126f; // scalar initialiser: the same weight ends up in all 16 lanes
-    const float16 rgb2u8_green_coef_bt709 = 0.7152f;
-    const float16 rgb2u8_blue_coef_bt709  = 0.0722f;
-
-    //Computation of 16 greyscale values in float: 0.2126 * R + 0.7152 * G + 0.0722 * B per lane
-    const float16 greyscale_f_0 = rgb2u8_red_coef_bt709 * convert_float16(rgb_r) + rgb2u8_green_coef_bt709 * convert_float16(rgb_g) + rgb2u8_blue_coef_bt709 * convert_float16(rgb_b);
-
-    //Convert it to 16 grayscale uchar values (_sat: saturating, _rtz: round toward zero)
-    const uchar16 greyscale_u8_0 = convert_uchar16_sat_rtz(greyscale_f_0);
-
-    vstore16(greyscale_u8_0, 0, out.ptr);
-}
-
-/** Convert an RGBA8888 image to RGB888 (the fourth byte of every pixel is dropped)
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void RGBA8888_to_RGB888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-    // handle 16 pixels every time: 64 input bytes (4 per pixel) become 48 output bytes (3 per pixel)
-    uchar16 rgba_0 = vload16(0, in.ptr);
-    uchar16 rgba_1 = vload16(0, in.ptr + 16);
-    uchar16 rgba_2 = vload16(0, in.ptr + 32);
-    uchar16 rgba_3 = vload16(0, in.ptr + 48);
-
-    uchar16 rgb_0 = (uchar16)(rgba_0.s01245689, rgba_0.sacde, rgba_1.s0124); // swizzles skip lanes 3,7,b,f, i.e. the 4th byte of every pixel is dropped
-    uchar16 rgb_1 = (uchar16)(rgba_1.s5689acde, rgba_2.s01245689);           // continues from lane 5 of rgba_1 through lane 9 of rgba_2
-    uchar16 rgb_2 = (uchar16)(rgba_2.sacde, rgba_3.s01245689, rgba_3.sacde); // remaining lanes of rgba_2 and all kept lanes of rgba_3
-
-    vstore16(rgb_0, 0, out.ptr);
-    vstore16(rgb_1, 0, out.ptr + 16);
-    vstore16(rgb_2, 0, out.ptr + 32);
-}
-
-/** Convert a UYVY422 image to RGB888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void UYVY422_to_RGB888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 8 pixels every time: 16 input bytes in, 24 output bytes (16 + 8) out
-    uchar16 uyvy = vload16(0, in.ptr);
-
-    uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);                   // odd bytes: one luma value per pixel
-    char8  cb   = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128); // bytes 0,4,8,12, each used for two adjacent pixels; the 128 term presumably re-centres chroma around 0 in 8-bit lanes - confirm
-    char8  cr   = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128); // bytes 2,6,10,14, same duplication and offset as cb
-
-    float8 red_coef_bt709    = (float8)(1.5748f);  // weight of cr in red
-    float8 green_coef_bt709  = (float8)(-0.1873f); // weight of cb in green
-    float8 green_coef2_bt709 = (float8)(-0.4681f); // weight of cr in green
-    float8 blue_coef_bt709   = (float8)(1.8556f);  // weight of cb in blue
-    float8 lumav             = convert_float8(luma);
-
-    float8 f_r = red_coef_bt709 * convert_float8(cr);
-    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
-    float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
-    f_r += lumav; // each channel = luma + its chroma contribution
-    f_g += lumav;
-    f_b += lumav;
-
-    uchar8 r_0 = convert_uchar8_sat_rtz(f_r); // saturating conversion to uchar, rounding toward zero
-    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
-    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
-                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5); // first 16 interleaved R,G,B bytes: pixels 0..4 plus R of pixel 5
-    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);   // last 8 bytes: G,B of pixel 5 and pixels 6..7
-
-    vstore16(rgb_0, 0, out.ptr);
-    vstore8(rgb_1, 0, out.ptr + 16);
-}
-
-/** Convert a UYVY422 image to RGBX8888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void UYVY422_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 8 pixels every time: 16 input bytes in, 32 output bytes (2 x 16) out
-    uchar16 uyvy = vload16(0, in.ptr);
-
-    uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);                   // odd bytes: one luma value per pixel
-    char8  cb   = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128); // bytes 0,4,8,12, each used for two adjacent pixels; the 128 term presumably re-centres chroma around 0 in 8-bit lanes - confirm
-    char8  cr   = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128); // bytes 2,6,10,14, same duplication and offset as cb
-
-    float8 red_coef_bt709    = (float8)(1.5748f);  // weight of cr in red
-    float8 green_coef_bt709  = (float8)(-0.1873f); // weight of cb in green
-    float8 green_coef2_bt709 = (float8)(-0.4681f); // weight of cr in green
-    float8 blue_coef_bt709   = (float8)(1.8556f);  // weight of cb in blue
-    float8 lumav             = convert_float8(luma);
-
-    float8 f_r = red_coef_bt709 * convert_float8(cr);
-    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
-    float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
-    f_r += lumav; // each channel = luma + its chroma contribution
-    f_g += lumav;
-    f_b += lumav;
-
-    uchar8 r_0 = convert_uchar8_sat_rtz(f_r); // saturating conversion to uchar, rounding toward zero
-    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
-    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
-                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); // pixels 0..3, constant 255 as the 4th byte of each
-    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
-                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255); // pixels 4..7, constant 255 as the 4th byte of each
-
-    vstore16(rgba_0, 0, out.ptr);
-    vstore16(rgba_1, 0, out.ptr + 16);
-}
-
-/** Convert a YUYV422 image to RGB888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void YUYV422_to_RGB888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 8 pixels every time
-    uchar16 uyvy = vload16(0, in.ptr);
-
-    uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
-    char8  cb   = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
-    char8  cr   = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
-
-    float8 red_coef_bt709    = (float8)(1.5748f);
-    float8 green_coef_bt709  = (float8)(-0.1873f);
-    float8 green_coef2_bt709 = (float8)(-0.4681f);
-    float8 blue_coef_bt709   = (float8)(1.8556f);
-    float8 lumav             = convert_float8(luma);
-
-    float8 f_r = red_coef_bt709 * convert_float8(cr);
-    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
-    float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
-    f_r += lumav;
-    f_g += lumav;
-    f_b += lumav;
-
-    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
-    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
-    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
-                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
-    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
-
-    vstore16(rgb_0, 0, out.ptr);
-    vstore8(rgb_1, 0, out.ptr + 16);
-}
-
-/** Convert a YUYV422 image to RGBX8888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void YUYV422_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    // handle 8 pixels every time
-    uchar16 uyvy = vload16(0, in.ptr);
-
-    uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
-    char8  cb   = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
-    char8  cr   = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
-
-    float8 red_coef_bt709    = (float8)(1.5748f);
-    float8 green_coef_bt709  = (float8)(-0.1873f);
-    float8 green_coef2_bt709 = (float8)(-0.4681f);
-    float8 blue_coef_bt709   = (float8)(1.8556f);
-    float8 lumav             = convert_float8(luma);
-
-    float8 f_r = red_coef_bt709 * convert_float8(cr);
-    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
-    float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
-    f_r += lumav;
-    f_g += lumav;
-    f_b += lumav;
-
-    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
-    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
-    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
-                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
-                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
-
-    vstore16(rgba_0, 0, out.ptr);
-    vstore16(rgba_1, 0, out.ptr + 16);
-}
-
-/** Convert a RGB image to NV12 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  input_ptr                           Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_ptr                            Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_stride_x                       Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_step_x                         luma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_stride_y                       Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_step_y                         luma_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_offset_first_element_in_bytes  The offset of the first element in the destination image luma channel
- * @param[out] uv_ptr                              Pointer to the destination uv channel. Supported Format: U8
- * @param[in]  uv_stride_x                         Stride of the destination uv channel in X dimension (in bytes)
- * @param[in]  uv_step_x                           uv_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_stride_y                         Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  uv_step_y                           uv_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_offset_first_element_in_bytes    The offset of the first element in the destination image uv channel
- *
- */
-__kernel void RGB888_to_NV12_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(luma),
-    IMAGE_DECLARATION(uv))
-{
-    Image in     = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma);
-    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
-
-    // handle 4 pixels every time, two lines, each line for 2 pixels
-    // Read 2 pixel of the first line
-    uchar8 rgb_0 = vload8(0, in.ptr);
-    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s3);
-    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s4);
-    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s5);
-
-    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
-    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
-    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
-
-    short2 i_y = convert_short2_rtz(f_y);
-    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
-    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_0, 0, out_y.ptr);
-
-    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Read 2 pixel of the second line
-    uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
-    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s3);
-    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s4);
-    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s5);
-
-    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
-    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
-    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
-    i_y = convert_short2_rtz(f_y);
-    i_u = convert_short2_rtz(f_u) + (short2)(128);
-    i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_1, 0, out_y.ptr + luma_stride_y);
-
-    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
-                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
-
-    vstore2(cbcr, 0, out_uv.ptr);
-}
-
-/*
-    R'= Y' + 0.0000*U + 1.5748*V
-    G'= Y' - 0.1873*U - 0.4681*V
-    B'= Y' + 1.8556*U + 0.0000*V
-*/
-
-/** Convert an NV12 image to RGB888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgb_output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV12_to_RGB888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(rgb_output))
-{
-    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
-
-    // handle 8 pixels every time, two lines, each line for 4 pixels
-    uchar4 luma_0 = vload4(0, in_luma.ptr);
-    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
-    uchar4 cbcr   = vload4(0, in_uv.ptr);
-    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
-    char4  cr     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
-
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
-    float4 f_r = convert_float4(luma_0) + temp0;
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr);
-    vstore4(rgb_1, 0, out_rgb.ptr + 8);
-
-    f_r = convert_float4(luma_1) + temp0;
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
-    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
-
-/** Convert a RGB image to YUV444 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  rgb_input_ptr                             Pointer to the source image. Supported Format: U8
- * @param[in]  rgb_input_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  rgb_input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  rgb_input_step_y                          rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination image V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void RGB888_to_YUV444_bt709(
-    IMAGE_DECLARATION(rgb_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    // handle 4 pixels every time
-    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v  = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Read 4 pixel
-    uchar16 rgb_0 = vload16(0, in_rgb.ptr);
-    uchar4  r_0   = (uchar4)(rgb_0.s0, rgb_0.s3, rgb_0.s6, rgb_0.s9);
-    uchar4  g_0   = (uchar4)(rgb_0.s1, rgb_0.s4, rgb_0.s7, rgb_0.sa);
-    uchar4  b_0   = (uchar4)(rgb_0.s2, rgb_0.s5, rgb_0.s8, rgb_0.sb);
-
-    float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
-    float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
-    float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
-
-    short4 i_y = convert_short4_rtz(f_y);
-    short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
-    short4 i_v = convert_short4_rtz(f_v) + (short4)(128);
-
-    uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255))));
-    vstore4(luma_0, 0, out_y.ptr);
-
-    uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255))));
-    uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255))));
-    vstore4(cb_0, 0, out_u.ptr);
-    vstore4(cr_0, 0, out_v.ptr);
-}
-
-/** Convert a RGB image to IYUV using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
- * No offset.
- *
- * @param[in]  rgb_input_ptr                             Pointer to the source image. Supported Format: U8
- * @param[in]  rgb_input_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  rgb_input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  rgb_input_step_y                          rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void RGB888_to_IYUV_bt709(
-    IMAGE_DECLARATION(rgb_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    // handle 4 pixels every time, two lines, each line for 2 pixels
-    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v  = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Read 2 pixel of the first line
-    uchar8 rgb_0 = vload8(0, in_rgb.ptr);
-    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s3);
-    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s4);
-    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s5);
-
-    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
-    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
-    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
-
-    short2 i_y = convert_short2_rtz(f_y);
-    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
-    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_0, 0, out_y.ptr);
-
-    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Read 2 pixel of the second line
-    uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgb_input_stride_y);
-    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s3);
-    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s4);
-    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s5);
-
-    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
-    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
-    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
-    i_y = convert_short2_rtz(f_y);
-    i_u = convert_short2_rtz(f_u) + (short2)(128);
-    i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
-
-    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
-                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
-    *out_u.ptr = cbcr.x;
-    *out_v.ptr = cbcr.y;
-}
-
-/** Convert a RGBA image to YUV444 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  rgba_input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  rgba_input_step_y                         rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination image V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void RGBA8888_to_YUV444_bt709(
-    IMAGE_DECLARATION(rgba_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    // handle 4 pixels every time
-    Image in_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input);
-    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Read 4 pixel
-    uchar16 rgb_0 = vload16(0, in_rgba.ptr);
-    uchar4  r_0   = (uchar4)(rgb_0.s0, rgb_0.s4, rgb_0.s8, rgb_0.sc);
-    uchar4  g_0   = (uchar4)(rgb_0.s1, rgb_0.s5, rgb_0.s9, rgb_0.sd);
-    uchar4  b_0   = (uchar4)(rgb_0.s2, rgb_0.s6, rgb_0.sa, rgb_0.se);
-
-    float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
-    float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
-    float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
-
-    short4 i_y = convert_short4(f_y);
-    short4 i_u = convert_short4(f_u) + (short4)(128);
-    short4 i_v = convert_short4(f_v) + (short4)(128);
-
-    uchar4 luma_0 = convert_uchar4_sat(max((short4)(0), min(i_y, (short4)(255))));
-    vstore4(luma_0, 0, out_y.ptr);
-
-    uchar4 cb_0 = convert_uchar4_sat(max((short4)(0), min(i_u, (short4)(255))));
-    uchar4 cr_0 = convert_uchar4_sat(max((short4)(0), min(i_v, (short4)(255))));
-    vstore4(cb_0, 0, out_u.ptr);
-    vstore4(cr_0, 0, out_v.ptr);
-}
-
-/** Convert an RGBA8888 image to NV12 (luma plane plus interleaved CbCr plane) using BT709 coefficients
- *
- * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
- * No offset.
- * Each work-item converts a 2x2 block of pixels: 2 luma bytes are written on each of two rows and one averaged (Cb, Cr) pair is written to the uv plane.
- * @param[in]  input_ptr                                 Pointer to the source image. Supported Format: U8
- * @param[in]  input_stride_x                            Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                            Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
- * @param[out] uv_output_ptr                             Pointer to the destination uv channel. Supported Format: U8
- * @param[in]  uv_output_stride_x                        Stride of the destination uv channel in X dimension (in bytes)
- * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_output_stride_y                        Stride of the destination image uv channel in Y dimension (in bytes)
- * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination image uv channel
- * @note Chroma is computed per pixel, biased by 128 and clamped to [0, 255] before the four samples are averaged with an integer division.
- */
-__kernel void RGBA8888_to_NV12_bt709(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(uv_output))
-{
-    Image in     = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
-
-    // Load 2 RGBA pixels (8 bytes) of the first row and pick out R, G and B; the alpha bytes (s3, s7) are not used
-    uchar8 rgb_0 = vload8(0, in.ptr);
-    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s4);
-    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s5);
-    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s6);
-
-    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
-    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
-    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
-    // Truncate toward zero, then add the 128 bias to the two chroma components
-    short2 i_y = convert_short2_rtz(f_y);
-    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
-    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-    // Clamp to [0, 255] before narrowing to uchar, then store the 2 luma bytes of the first row
-    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_0, 0, out_y.ptr);
-
-    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Load 2 RGBA pixels of the second row, located input_stride_y bytes after the first row
-    uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
-    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s4);
-    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s5);
-    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s6);
-
-    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
-    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
-    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
-    i_y = convert_short2_rtz(f_y);
-    i_u = convert_short2_rtz(f_u) + (short2)(128);
-    i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    // Average the four clamped chroma samples of the 2x2 block and store them as one interleaved (Cb, Cr) pair
-    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
-                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
-    vstore2(cbcr, 0, out_uv.ptr);
-}
-
-/** Convert an RGBA8888 image to IYUV (separate luma, U and V planes) using BT709 coefficients
- *
- * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
- * No offset.
- * Each work-item converts a 2x2 block of pixels: 2 luma bytes are written on each of two rows, plus one averaged byte to the U plane and one to the V plane.
- * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  rgba_input_step_x                         rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  rgba_input_step_y                         rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- * @note Chroma is computed per pixel, biased by 128 and clamped to [0, 255] before the four samples are averaged with an integer division.
- */
-__kernel void RGBA8888_to_IYUV_bt709(
-    IMAGE_DECLARATION(rgba_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    // Each work-item handles 4 pixels: two rows of 2 pixels each
-    Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_input);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v  = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Load 2 RGBA pixels (8 bytes) of the first row and pick out R, G and B; the alpha bytes (s3, s7) are not used
-    uchar8 rgb_0 = vload8(0, in_rgb.ptr);
-    uchar2 r_0   = (uchar2)(rgb_0.s0, rgb_0.s4);
-    uchar2 g_0   = (uchar2)(rgb_0.s1, rgb_0.s5);
-    uchar2 b_0   = (uchar2)(rgb_0.s2, rgb_0.s6);
-
-    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
-    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
-    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
-    // Truncate toward zero, then add the 128 bias to the two chroma components
-    short2 i_y = convert_short2_rtz(f_y);
-    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
-    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-    // Clamp to [0, 255] before narrowing to uchar, then store the 2 luma bytes of the first row
-    uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_0, 0, out_y.ptr);
-
-    uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Load 2 RGBA pixels of the second row, located rgba_input_stride_y bytes after the first row
-    uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgba_input_stride_y);
-    uchar2 r_1   = (uchar2)(rgb_1.s0, rgb_1.s4);
-    uchar2 g_1   = (uchar2)(rgb_1.s1, rgb_1.s5);
-    uchar2 b_1   = (uchar2)(rgb_1.s2, rgb_1.s6);
-
-    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
-    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
-    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
-    i_y = convert_short2_rtz(f_y);
-    i_u = convert_short2_rtz(f_u) + (short2)(128);
-    i_v = convert_short2_rtz(f_v) + (short2)(128);
-
-    uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
-    vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    // Average the four clamped chroma samples of the 2x2 block; the Cb average goes to the U plane and the Cr average to the V plane
-    uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
-    uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
-                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
-    *out_u.ptr = cbcr.x;
-    *out_v.ptr = cbcr.y;
-}
-
-/** Convert an NV12 image to RGBA8888 using BT709 coefficients
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- * Each work-item converts a 4x2 block of pixels: every (Cb, Cr) pair is shared by a 2x2 group of luma samples and the alpha byte is set to 255.
- * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                        Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image uv channel
- * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV12_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(rgb_output))
-{
-    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
-    // Load 4 luma samples from each of two rows and 2 interleaved (Cb, Cr) pairs; each pair is duplicated for two adjacent pixels and 128 is subtracted
-    uchar4 luma_0 = vload4(0, in_luma.ptr);
-    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
-    uchar4 cbcr   = vload4(0, in_uv.ptr);
-    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
-    char4  cr     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
-    // Chroma contributions to R (temp0), G (temp1) and B (temp2); computed once and reused for both rows
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
-    float4 f_r = convert_float4(luma_0) + temp0;
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-    // Interleave as R, G, B, 255 and store 4 pixels (16 bytes) on the first output row
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
-    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    vstore8(rgb_0, 0, out_rgb.ptr);
-    vstore8(rgb_1, 0, out_rgb.ptr + 8);
-    // Second row: same chroma terms, luma taken from luma_1; the variables of the first row are reused
-    f_r = convert_float4(luma_1) + temp0;
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
-    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
-    vstore8(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
-
-/** Convert an NV12 image to IYUV by splitting the interleaved CbCr plane into separate U and V planes
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- * Each work-item copies 16 luma bytes on each of two rows and de-interleaves 16 chroma bytes into 8 U and 8 V bytes; sample values are not modified.
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                         Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image uv channel
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- */
-__kernel void NV12_to_IYUV_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Each work-item handles 32 pixels: two rows of 16 pixels; even chroma bytes are taken as Cb and odd chroma bytes as Cr
-    uchar16 luma_0 = vload16(0, in_y.ptr);
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
-    uchar16 cbcr   = vload16(0, in_uv.ptr);
-    uchar8  cb     = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
-    uchar8  cr     = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
-    // Luma is copied unchanged to both output rows; the 8 Cb bytes go to the U plane and the 8 Cr bytes to the V plane
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore8(cb, 0, out_u.ptr);
-    vstore8(cr, 0, out_v.ptr);
-}
-
-/** Convert an NV12 image to YUV444 by replicating every chroma sample over a 2x2 group of pixels
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- * Each work-item copies 16 luma bytes on each of two rows and writes 16 U and 16 V bytes on each of two rows; sample values are not modified.
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                         Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image uv channel
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- */
-__kernel void NV12_to_YUV444_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Each work-item handles 32 pixels: two rows of 16 pixels; every Cb (even byte) and Cr (odd byte) is duplicated for two adjacent pixels
-    uchar16 luma_0 = vload16(0, in_y.ptr);
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
-    uchar16 cbcr   = vload16(0, in_uv.ptr);
-    uchar16 cb     = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
-                               cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
-    uchar16 cr = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
-                           cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
-    // Luma is copied unchanged; the same widened chroma vectors are written to both output rows of the U and V planes
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore16(cb, 0, out_u.ptr);
-    vstore16(cb, 0, out_u.ptr + u_output_stride_y);
-    vstore16(cr, 0, out_v.ptr);
-    vstore16(cr, 0, out_v.ptr + v_output_stride_y);
-}
-
-/** Convert an NV21 image to RGB888 using BT709 coefficients
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- * Each work-item converts a 4x2 block of pixels: every chroma pair (read as Cr first, then Cb) is shared by a 2x2 group of luma samples.
- * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                        Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image uv channel
- * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV21_to_RGB888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(rgb_output))
-{
-    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
-
-    // Each work-item handles 8 pixels: two rows of 4 pixels; even chroma bytes are taken as Cr and odd chroma bytes as Cb
-    uchar4 luma_0 = vload4(0, in_y.ptr);
-    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
-    uchar4 cbcr   = vload4(0, in_uv.ptr);
-    char4  cr     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
-    char4  cb     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
-    // Chroma contributions to R (temp0), G (temp1) and B (temp2); computed once and reused for both rows
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
-    float4 f_r = convert_float4(luma_0) + temp0;
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-    // Interleave as R, G, B and store 4 pixels (12 bytes) on the first output row as an 8-byte store followed by a 4-byte store
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr);
-    vstore4(rgb_1, 0, out_rgb.ptr + 8);
-    // Second row: same chroma terms, luma taken from luma_1; the variables of the first row are reused
-    f_r = convert_float4(luma_1) + temp0;
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
-    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
-
-/** Convert an NV21 image to RGBA8888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] rgba_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgba_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgba_output_step_x                        rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgba_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgba_output_step_y                        rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV21_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(rgba_output))
-{
-    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
-
-    // handle 8 pixels every time, two lines, each line for 4 pixels
-    uchar4 luma_0 = vload4(0, in_luma.ptr);
-    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
-    uchar4 cbcr   = vload4(0, in_uv.ptr);
-    char4  cr     = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
-    char4  cb     = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
-
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
-    float4 f_r = convert_float4(luma_0) + temp0;
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
-    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    vstore8(rgb_0, 0, out_rgb.ptr);
-    vstore8(rgb_1, 0, out_rgb.ptr + 8);
-
-    f_r = convert_float4(luma_1) + temp0;
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
-    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
-    vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
-}
-
-/** Convert an NV21 image to YUV444
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- */
-__kernel void NV21_to_YUV444_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // handle 32 pixels every time, two lines, each line for 16 pixels
-    uchar16 luma_0 = vload16(0, in_y.ptr);
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
-    uchar16 cbcr   = vload16(0, in_uv.ptr);
-    uchar16 cr     = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
-                               cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
-    uchar16 cb = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
-                           cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
-
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore16(cb, 0, out_u.ptr);
-    vstore16(cb, 0, out_u.ptr + u_output_stride_y);
-    vstore16(cr, 0, out_v.ptr);
-    vstore16(cr, 0, out_v.ptr + v_output_stride_y);
-}
-
-/** Convert an NV21 image to IYUV
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
- * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
- * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- */
-__kernel void NV21_to_IYUV_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(uv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
-    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    uchar16 luma_0 = vload16(0, in_y.ptr);
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
-    uchar16 cbcr   = vload16(0, in_uv.ptr);
-    uchar8  cr     = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
-    uchar8  cb     = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
-
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore8(cb, 0, out_u.ptr);
-    vstore8(cr, 0, out_v.ptr);
-}
-
-/** Convert a UYVY image to IYUV using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  uyvy_input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  uyvy_input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  uyvy_input_step_x                         uyvy_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uyvy_input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  uyvy_input_step_y                         uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uyvy_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void UYVY422_to_IYUV_bt709(
-    IMAGE_DECLARATION(uyvy_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_uyvy = CONVERT_TO_IMAGE_STRUCT(uyvy_input);
-    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // handle 16 pixels every time, each line 8 pixels
-    uchar16 uyvy = vload16(0, in_uyvy.ptr);
-    uchar8  luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
-    ushort4 cb_0 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
-    ushort4 cr_0 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
-    vstore8(luma, 0, out_y.ptr);
-
-    uyvy         = vload16(0, in_uyvy.ptr + uyvy_input_stride_y);
-    luma         = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
-    ushort4 cb_1 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
-    ushort4 cr_1 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
-    vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
-
-    uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
-    uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
-    vstore4(cb, 0, out_u.ptr);
-    vstore4(cr, 0, out_v.ptr);
-}
-
-/** Convert a YUYV image to IYUV using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  yuyv_input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  yuyv_input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  yuyv_input_step_x                         yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  yuyv_input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  yuyv_input_step_y                         yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  yuyv_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void YUYV422_to_IYUV_bt709(
-    IMAGE_DECLARATION(yuyv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
-    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // handle 16 pixels every time, each line 8 pixels
-    uchar16 yuyv = vload16(0, in_yuyv.ptr);
-    uchar8  luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
-    ushort4 cb_0 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
-    ushort4 cr_0 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
-    vstore8(luma, 0, out_y.ptr);
-
-    yuyv         = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
-    luma         = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
-    ushort4 cb_1 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
-    ushort4 cr_1 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
-    vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
-
-    uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
-    uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
-    vstore4(cb, 0, out_u.ptr);
-    vstore4(cr, 0, out_v.ptr);
-}
-
-/** Convert an IYUV image to RGB888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  u_input_ptr                              Pointer to the source U channel. Supported Format: U8
- * @param[in]  u_input_stride_x                         Stride of the source image U channel in X dimension (in bytes)
- * @param[in]  u_input_step_x                           u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_input_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  u_input_step_y                           u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_input_offset_first_element_in_bytes    The offset of the first element in the source U channel
- * @param[in]  v_input_ptr                              Pointer to the source V channel. Supported Format: U8
- * @param[in]  v_input_stride_x                         Stride of the source image V channel in X dimension (in bytes)
- * @param[in]  v_input_step_x                           v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_input_stride_y                         Stride of the source image V channel in Y dimension (in bytes)
- * @param[in]  v_input_step_y                           v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_input_offset_first_element_in_bytes    The offset of the first element in the source image V channel
- * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void IYUV_to_RGB888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(u_input),
-    IMAGE_DECLARATION(v_input),
-    IMAGE_DECLARATION(rgb_output))
-{
-    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
-    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
-
-    // handle 8 pixels every time, two lines, each line for 4 pixels
-    uchar4 luma_0 = vload4(0, in_y.ptr);
-    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
-    uchar4 cbcr   = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
-    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
-    char4  cr     = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
-
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
-    float4 f_r = convert_float4(luma_0) + temp0;
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr);
-    vstore4(rgb_1, 0, out_rgb.ptr + 8);
-
-    f_r = convert_float4(luma_1) + temp0;
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
-    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
-    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
-    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
-
-/** Convert an IYUV image to RGB8888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
- * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
- * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_input_stride_y                          Stride of the source image in Y dimension (in bytes)
- * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
- * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
- * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
- * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
- * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
- * @param[out] rgba_output_ptr                           Pointer to the destination image. Supported Format: U8
- * @param[in]  rgba_output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  rgba_output_step_x                        rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rgba_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgba_output_step_y                        rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void IYUV_to_RGBA8888_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(u_input),
-    IMAGE_DECLARATION(v_input),
-    IMAGE_DECLARATION(rgba_output))
-{
-    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
-    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
-    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
-
-    // handle 8 pixels every time, two lines, each line for 4 pixels
-    uchar4 luma_0 = vload4(0, in_y.ptr);
-    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
-    uchar4 cbcr   = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
-    char4  cb     = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
-    char4  cr     = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
-
-    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
-    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
-    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
-    float4 f_r = convert_float4(luma_0) + temp0;
-    float4 f_g = convert_float4(luma_0) + temp1;
-    float4 f_b = convert_float4(luma_0) + temp2;
-
-    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
-    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
-    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    vstore8(rgb_0, 0, out_rgb.ptr);
-    vstore8(rgb_1, 0, out_rgb.ptr + 8);
-
-    f_r = convert_float4(luma_1) + temp0;
-    f_g = convert_float4(luma_1) + temp1;
-    f_b = convert_float4(luma_1) + temp2;
-
-    r_0 = convert_uchar4_sat_rtz(f_r);
-    g_0 = convert_uchar4_sat_rtz(f_g);
-    b_0 = convert_uchar4_sat_rtz(f_b);
-
-    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
-    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
-    vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
-    vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
-}
-
-/** Convert an IYUV image to YUV444
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
- * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
- * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_input_stride_y                          Stride of the source image in Y dimension (in bytes)
- * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
- * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
- * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
- * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
- * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
- * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
- * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
- * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
- * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
- * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
- * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
- *
- */
-__kernel void IYUV_to_YUV444_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(u_input),
-    IMAGE_DECLARATION(v_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(u_output),
-    IMAGE_DECLARATION(v_output))
-{
-    Image in_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_u  = CONVERT_TO_IMAGE_STRUCT(u_input);
-    Image in_v  = CONVERT_TO_IMAGE_STRUCT(v_input);
-    Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
-    Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // handle 32 pixels every time, two lines, each line for 16 pixels
-    uchar16 luma_0 = vload16(0, in_y.ptr);
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
-    uchar8  cb_src = vload8(0, in_u.ptr);
-    uchar8  cr_src = vload8(0, in_v.ptr);
-    uchar16 cb     = (uchar16)(cb_src.s0, cb_src.s0, cb_src.s1, cb_src.s1, cb_src.s2, cb_src.s2, cb_src.s3, cb_src.s3,
-                               cb_src.s4, cb_src.s4, cb_src.s5, cb_src.s5, cb_src.s6, cb_src.s6, cb_src.s7, cb_src.s7);
-    uchar16 cr = (uchar16)(cr_src.s0, cr_src.s0, cr_src.s1, cr_src.s1, cr_src.s2, cr_src.s2, cr_src.s3, cr_src.s3,
-                           cr_src.s4, cr_src.s4, cr_src.s5, cr_src.s5, cr_src.s6, cr_src.s6, cr_src.s7, cr_src.s7);
-
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore16(cb, 0, out_u.ptr);
-    vstore16(cb, 0, out_u.ptr + u_output_stride_y);
-    vstore16(cr, 0, out_v.ptr);
-    vstore16(cr, 0, out_v.ptr + v_output_stride_y);
-}
-
-/** Convert an IYUV image to NV12
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
- * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
- * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
- * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
- * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
- * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  u_input_stride_y                          Stride of the source image in Y dimension (in bytes)
- * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
- * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
- * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
- * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
- * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
- * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
- * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_output_stride_y                        Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
- *
- */
-__kernel void IYUV_to_NV12_bt709(
-    IMAGE_DECLARATION(luma_input),
-    IMAGE_DECLARATION(u_input),
-    IMAGE_DECLARATION(v_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(uv_output))
-{
-    Image in_y   = CONVERT_TO_IMAGE_STRUCT(luma_input);
-    Image in_u   = CONVERT_TO_IMAGE_STRUCT(u_input);
-    Image in_v   = CONVERT_TO_IMAGE_STRUCT(v_input);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
-
-    // handle 32 pixels every time, two lines, each line for 16 pixels
-    uchar16 luma_0 = vload16(0, in_y.ptr);
-    uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
-    uchar8  cb     = vload8(0, in_u.ptr);
-    uchar8  cr     = vload8(0, in_v.ptr);
-    uchar16 cbcr   = (uchar16)(cb.s0, cr.s0, cb.s1, cr.s1, cb.s2, cr.s2, cb.s3, cr.s3, cb.s4, cr.s4, cb.s5, cr.s5, cb.s6,
-                               cr.s6, cb.s7, cr.s7);
-
-    vstore16(luma_0, 0, out_y.ptr);
-    vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
-    vstore16(cbcr, 0, out_uv.ptr);
-}
-
-/** Convert a YUYV image to NV12 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in]  yuyv_input_ptr                            Pointer to the source image. Supported Format: U8
- * @param[in]  yuyv_input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  yuyv_input_step_x                         yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  yuyv_input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  yuyv_input_step_y                         yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  yuyv_input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
- * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
- * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_output_stride_y                        Stride of the destination image UV channel in Y dimension (in bytes)
- * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
- *
- */
-__kernel void YUYV422_to_NV12_bt709(
-    IMAGE_DECLARATION(yuyv_input),
-    IMAGE_DECLARATION(luma_output),
-    IMAGE_DECLARATION(uv_output))
-{
-    Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
-    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
-    Image out_uv  = CONVERT_TO_IMAGE_STRUCT(uv_output);
-
-    // handle 16 pixels every time, each line 8 pixels
-    uchar16 yuyv   = vload16(0, in_yuyv.ptr);
-    ushort8 cbcr_0 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
-    uchar8  luma   = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
-    vstore8(luma, 0, out_y.ptr);
-
-    yuyv           = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
-    ushort8 cbcr_1 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
-    luma           = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
-    vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
-
-    uchar8 cbcr = convert_uchar8((cbcr_0 + cbcr_1) / (ushort8)(2));
-    vstore8(cbcr, 0, out_uv.ptr);
-}
-
-/** Convert a UYVY image to NV12 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in]  input_uyvy_ptr                           Pointer to the source image. Supported Format: U8
- * @param[in]  input_uyvy_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  input_uyvy_step_x                        input_uyvy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_uyvy_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_uyvy_step_y                        input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_ptr                                 Pointer to the destination luma channel. Supported Format: U8
- * @param[in]  luma_stride_x                            Stride of the destination luma channel in X dimension (in bytes)
- * @param[in]  luma_step_x                              luma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  luma_stride_y                            Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  luma_step_y                              luma_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  luma_offset_first_element_in_bytes       The offset of the first element in the destination image luma channel
- * @param[out] uv_ptr                                   Pointer to the destination uv channel. Supported Format: U8
- * @param[in]  uv_stride_x                              Stride of the destination uv channel in X dimension (in bytes)
- * @param[in]  uv_step_x                                uv_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_stride_y                              Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in]  uv_step_y                                uv_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  uv_offset_first_element_in_bytes         The offset of the first element in the destination image uv channel
- *
- */
-__kernel void UYVY422_to_NV12_bt709(
-    IMAGE_DECLARATION(input_uyvy),
-    IMAGE_DECLARATION(luma),
-    IMAGE_DECLARATION(uv))
-{
-    Image in     = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
-    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma);
-    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
-
-    // handle 16 pixels every time, each line 8 pixels
-    const uchar16 uyvy_t = vload16(0, in.ptr);
-    vstore8(uyvy_t.s13579bdf, 0, out_y.ptr);
-
-    const uchar16 uyvy_b = vload16(0, in.ptr + input_uyvy_stride_y);
-    vstore8(uyvy_b.s13579bdf, 0, out_y.ptr + luma_stride_y);
-
-    const ushort8 cbcr_t = (ushort8)(uyvy_t.s0, uyvy_t.s2, uyvy_t.s4, uyvy_t.s6, uyvy_t.s8, uyvy_t.sa, uyvy_t.sc, uyvy_t.se);
-    const ushort8 cbcr_b = (ushort8)(uyvy_b.s0, uyvy_b.s2, uyvy_b.s4, uyvy_b.s6, uyvy_b.s8, uyvy_b.sa, uyvy_b.sc, uyvy_b.se);
-    const uchar8  cbcr   = convert_uchar8((cbcr_t + cbcr_b) / (ushort8)(2));
-    vstore8(cbcr, 0, out_uv.ptr);
-}
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
deleted file mode 100644
index 7bca567..0000000
--- a/src/core/CL/cl_kernels/convolution3x3.cl
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] left_pixel   Pointer to the left pixel.
- * @param[in] left_coeff   Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff  Weight of the right pixel
- *
- * @return a short8 containing 8 convoluted values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3(__global const uchar *left_pixel,
-                                                  const short left_coeff,
-                                                  const short middle_coeff,
-                                                  const short right_coeff)
-{
-    uchar16 temp = vload16(0, left_pixel);
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    middle = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
-
-    return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff;
-}
-
-/** Apply a 3x3 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:
- *
- * [ mat0, mat1, mat2 ]\n
- * [ mat3, mat4, mat5 ]\n
- * [ mat6, mat7, mat8 ]\n
- *
- * @param[in] src   A pointer to source Image structure
- * @param[in] mat0  Coefficient from the convolution matrix
- * @param[in] mat1  Coefficient from the convolution matrix
- * @param[in] mat2  Coefficient from the convolution matrix
- * @param[in] mat3  Coefficient from the convolution matrix
- * @param[in] mat4  Coefficient from the convolution matrix
- * @param[in] mat5  Coefficient from the convolution matrix
- * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat7  Coefficient from the convolution matrix
- * @param[in] mat8  Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
- *
- * @return a short8 containing 8 convoluted and scaled values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3(
-    Image      *src,
-    const short mat0, const short mat1, const short mat2,
-    const short mat3, const short mat4, const short mat5,
-    const short mat6, const short mat7, const short mat8, uint scale)
-{
-    // Output pixels
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels;
-
-    // Row 0
-    pixels = convolution1x3(offset(src, -1, -1), mat0, mat1, mat2);
-    // Row
-    pixels += convolution1x3(offset(src, -1, 0), mat3, mat4, mat5);
-    // Row 2
-    pixels += convolution1x3(offset(src, -1, 1), mat6, mat7, mat8);
-
-    // Divide by the scale
-    return pixels / (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
-}
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 3x3 static convolution matrix to a single channel U8 input image and output a single channel image.
- *
- * @attention The matrix coefficients(MAT0, MAT1, ... MAT8, SCALE), DATA_TYPE, and DATA_TYPE_OUT need to be passed at compile time.\n
- * e.g. -DMAT0=1 -DMAT2=2, ...-DMAT8=8, -DSCALE=1, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution3x3_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels = convolution3x3(&src,
-                            MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, SCALE);
-
-    // Store the result as is in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
deleted file mode 100644
index 9995ebf..0000000
--- a/src/core/CL/cl_kernels/convolution5x5.cl
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] left_pixel   Pointer to the left pixel
- * @param[in] left1_coeff  Weight of the most left pixel
- * @param[in] left2_coeff  Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right1_coeff Weight of the right pixel
- * @param[in] right2_coeff Weight of the most right pixel
- *
- * @return a short8 containing 8 convoluted values.
- */
-VEC_DATA_TYPE(DATA_TYPE, 8)
-convolution1x5(
-    __global const uchar *left_pixel,
-    const short           left1_coeff,
-    const short           left2_coeff,
-    const short           middle_coeff,
-    const short           right1_coeff,
-    const short           right2_coeff)
-{
-    uchar16 temp = vload16(0, left_pixel);
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    middle = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right1 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right2 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
-
-    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff
-           + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff;
-}
-
-/** Compute a 1D vertical convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] src          Pointer to source image.
- * @param[in] up1_coeff    Weight of the most up pixel
- * @param[in] up2_coeff    Weight of the up pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff  Weight of the down pixel
- * @param[in] down2_coeff  Weight of the most down pixel
- *
- * @return a short8 containing 8 convoluted values.
- */
-VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-convolution5x1(
-    Image      *src,
-    const short up1_coeff,
-    const short up2_coeff,
-    const short middle_coeff,
-    const short down1_coeff,
-    const short down2_coeff)
-{
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    val;
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
-
-    return out;
-}
-
-/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:\n
- * [  mat0,  mat1,  mat2,  mat3 , mat4 ]\n
- * [  mat5,  mat6,  mat7,  mat8,  mat9 ]\n
- * [ mat10, mat11, mat12, mat13, mat14 ]\n
- * [ mat15, mat16, mat17, mat18, mat19 ]\n
- * [ mat20, mat21, mat22, mat23, mat24 ]
- *
- * @param[in] src   A pointer to source Image structure.
- * @param[in] mat0  Coefficient from the convolution matrix
- * @param[in] mat1  Coefficient from the convolution matrix
- * @param[in] mat2  Coefficient from the convolution matrix
- * @param[in] mat3  Coefficient from the convolution matrix
- * @param[in] mat4  Coefficient from the convolution matrix
- * @param[in] mat5  Coefficient from the convolution matrix
- * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat7  Coefficient from the convolution matrix
- * @param[in] mat8  Coefficient from the convolution matrix
- * @param[in] mat9  Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
- * @param[in] mat11 Coefficient from the convolution matrix
- * @param[in] mat12 Coefficient from the convolution matrix
- * @param[in] mat13 Coefficient from the convolution matrix
- * @param[in] mat14 Coefficient from the convolution matrix
- * @param[in] mat15 Coefficient from the convolution matrix
- * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat17 Coefficient from the convolution matrix
- * @param[in] mat18 Coefficient from the convolution matrix
- * @param[in] mat19 Coefficient from the convolution matrix
- * @param[in] mat20 Coefficient from the convolution matrix
- * @param[in] mat21 Coefficient from the convolution matrix
- * @param[in] mat22 Coefficient from the convolution matrix
- * @param[in] mat23 Coefficient from the convolution matrix
- * @param[in] mat24 Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
- *
- * @return a short8 containing 8 convoluted and scaled values.
- */
-short8 convolution5x5(
-    Image      *src,
-    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
-    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
-    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
-    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
-    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
-    uint scale)
-{
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels;
-
-    pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4);
-    pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9);
-    pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14);
-    pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19);
-    pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24);
-
-    if(scale > 0)
-    {
-        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
-    }
-
-    return convert_short8_sat(pixels);
-}
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single temporary channel image(Support U16, S16, S32).
- *
- * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) and DATA_TYPE need to be passed at compile time:\n
- * e.g. -DMAT0=1 -DMAT2=2, -DMAT3=3, -DMAT4=4, -DDATA_TYPE=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable1x5_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4);
-
-    // Store result in dst
-    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-/** Apply a 5x1 static convolution matrix to a single channel U8 input image and output a single channel image.
- *
- * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT5=1 -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable5x1_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9);
-
-    // Divide by the scale
-    pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
-
-    // Store result in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including borders
- *
- * @attention The matrix coefficients(MAT0, MAT1, ... MAT24, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution5x5_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    short8 pixels = convolution5x5(&src,
-                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
-                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE);
-
-    // Store the result as is in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl
deleted file mode 100644
index 50fb3d7..0000000
--- a/src/core/CL/cl_kernels/convolution7x7.cl
+++ /dev/null
@@ -1,338 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] left_pixel   Pointer to the left pixel
- * @param[in] left1_coeff  Weight of the most left pixel
- * @param[in] left2_coeff  Weight of the second left pixel
- * @param[in] left3_coeff  Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right1_coeff Weight of the right pixel
- * @param[in] right2_coeff Weight of the second right pixel
- * @param[in] right3_coeff Weight of the most right pixel
- *
- * @return a short8 containing 8 convoluted values.
- */
-VEC_DATA_TYPE(DATA_TYPE, 8)
-convolution1x7(
-    __global const uchar *left_pixel,
-    const short           left1_coeff,
-    const short           left2_coeff,
-    const short           left3_coeff,
-    const short           middle_coeff,
-    const short           right1_coeff,
-    const short           right2_coeff,
-    const short           right3_coeff)
-{
-    uchar16 temp = vload16(0, left_pixel);
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    middle = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right1 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right2 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right3 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
-
-    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE,
-            8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff;
-}
-
-/** Compute a 1D vertical convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] src          Pointer to source image.
- * @param[in] up1_coeff    Weight of the most up pixel
- * @param[in] up2_coeff    Weight of the second up pixel
- * @param[in] up3_coeff    Weight of the up pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff  Weight of the down pixel
- * @param[in] down2_coeff  Weight of the second down pixel
- * @param[in] down3_coeff  Weight of the third down pixel
- *
- * @return a short8 containing 8 convoluted values.
- */
-VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-convolution7x1(
-    Image      *src,
-    const short up1_coeff,
-    const short up2_coeff,
-    const short up3_coeff,
-    const short middle_coeff,
-    const short down1_coeff,
-    const short down2_coeff,
-    const short down3_coeff)
-{
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    val;
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
-
-    return out;
-}
-
-/** Apply a 7x7 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:\n
- * [  mat0,  mat1,  mat2,  mat3 , mat4,  mat5,  mat6 ]\n
- * [  mat7,  mat8,  mat9,  mat10, mat11, mat12, mat13 ]\n
- * [  mat14, mat15, mat16, mat17, mat18, mat19, mat20 ]\n
- * [  mat21, mat22, mat23, mat24, mat25, mat26, mat27 ]\n
- * [  mat28, mat29, mat30, mat31, mat32, mat33, mat34 ]\n
- * [  mat35, mat36, mat37, mat38, mat39, mat40, mat41 ]\n
- * [  mat42, mat43, mat44, mat45, mat46, mat47, mat48 ]
- *
- * @param[in] src   A pointer to source Image structure.
- * @param[in] mat0  Coefficient from the convolution matrix
- * @param[in] mat1  Coefficient from the convolution matrix
- * @param[in] mat2  Coefficient from the convolution matrix
- * @param[in] mat3  Coefficient from the convolution matrix
- * @param[in] mat4  Coefficient from the convolution matrix
- * @param[in] mat5  Coefficient from the convolution matrix
- * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat7  Coefficient from the convolution matrix
- * @param[in] mat8  Coefficient from the convolution matrix
- * @param[in] mat9  Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
- * @param[in] mat11 Coefficient from the convolution matrix
- * @param[in] mat12 Coefficient from the convolution matrix
- * @param[in] mat13 Coefficient from the convolution matrix
- * @param[in] mat14 Coefficient from the convolution matrix
- * @param[in] mat15 Coefficient from the convolution matrix
- * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat17 Coefficient from the convolution matrix
- * @param[in] mat18 Coefficient from the convolution matrix
- * @param[in] mat19 Coefficient from the convolution matrix
- * @param[in] mat20 Coefficient from the convolution matrix
- * @param[in] mat21 Coefficient from the convolution matrix
- * @param[in] mat22 Coefficient from the convolution matrix
- * @param[in] mat23 Coefficient from the convolution matrix
- * @param[in] mat24 Coefficient from the convolution matrix
- * @param[in] mat25 Coefficient from the convolution matrix
- * @param[in] mat26 Coefficient from the convolution matrix
- * @param[in] mat27 Coefficient from the convolution matrix
- * @param[in] mat28 Coefficient from the convolution matrix
- * @param[in] mat29 Coefficient from the convolution matrix
- * @param[in] mat30 Coefficient from the convolution matrix
- * @param[in] mat31 Coefficient from the convolution matrix
- * @param[in] mat32 Coefficient from the convolution matrix
- * @param[in] mat33 Coefficient from the convolution matrix
- * @param[in] mat34 Coefficient from the convolution matrix
- * @param[in] mat35 Coefficient from the convolution matrix
- * @param[in] mat36 Coefficient from the convolution matrix
- * @param[in] mat37 Coefficient from the convolution matrix
- * @param[in] mat38 Coefficient from the convolution matrix
- * @param[in] mat39 Coefficient from the convolution matrix
- * @param[in] mat40 Coefficient from the convolution matrix
- * @param[in] mat41 Coefficient from the convolution matrix
- * @param[in] mat42 Coefficient from the convolution matrix
- * @param[in] mat43 Coefficient from the convolution matrix
- * @param[in] mat44 Coefficient from the convolution matrix
- * @param[in] mat45 Coefficient from the convolution matrix
- * @param[in] mat46 Coefficient from the convolution matrix
- * @param[in] mat47 Coefficient from the convolution matrix
- * @param[in] mat48 Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
- *
- */
-short8 convolution7x7(
-    Image      *src,
-    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
-    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
-    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
-    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
-    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
-    const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
-    const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
-    const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
-    const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
-    const short mat45, const short mat46, const short mat47, const short mat48, uint scale)
-{
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels;
-
-    pixels = convolution1x7(offset(src, -3, -3), mat0, mat1, mat2, mat3, mat4, mat5, mat6);
-    pixels += convolution1x7(offset(src, -3, -2), mat7, mat8, mat9, mat10, mat11, mat12, mat13);
-    pixels += convolution1x7(offset(src, -3, -1), mat14, mat15, mat16, mat17, mat18, mat19, mat20);
-    pixels += convolution1x7(offset(src, -3, 0), mat21, mat22, mat23, mat24, mat25, mat26, mat27);
-    pixels += convolution1x7(offset(src, -3, 1), mat28, mat29, mat30, mat31, mat32, mat33, mat34);
-    pixels += convolution1x7(offset(src, -3, 2), mat35, mat36, mat37, mat38, mat39, mat40, mat41);
-    pixels += convolution1x7(offset(src, -3, 3), mat42, mat43, mat44, mat45, mat46, mat47, mat48);
-
-    if(scale > 0)
-    {
-        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
-    }
-
-    return convert_short8_sat(pixels);
-}
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 1x7 static convolution matrix to a single channel U8 input image and output a single temporary channel image.
- *
- * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6) and DATA_TYPE need to be passed at compile time:\n
- * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT6=6, -DDATA_TYPE=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable1x7_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels = convolution1x7(offset(&src, -3, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6);
-
-    // Store result in dst
-    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-/** Apply a 7x1 static convolution matrix to a single channel U8 input image and output a single channel image.
- *
- * @attention The matrix coefficients (MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT24=13, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable7x1_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    pixels = convolution7x1(&src, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13);
-
-    // Divide by the scale
-    pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
-
-    // Store result in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-/** Apply a static 7x7 convolution matrix to a single channel U8 input image and output a single channel U8 image including the borders.
- *
- * @attention The matrix coefficients(MAT0, MAT1, ... MAT48, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT48=48, -DSCALE=6, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution7x7_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    short8 pixels = convolution7x7(&src,
-                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
-                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
-                                   MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
-                                   MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, SCALE);
-
-    // Clamp results to [ 0, 255 ] and store them in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl
deleted file mode 100644
index 7e77c61..0000000
--- a/src/core/CL/cl_kernels/convolution9x9.cl
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] left_pixel   Pointer to the left pixel
- * @param[in] left1_coeff  Weight of the most left pixel
- * @param[in] left2_coeff  Weight of the second left pixel
- * @param[in] left3_coeff  Weight of the third left pixel
- * @param[in] left4_coeff  Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right1_coeff Weight of the right pixel
- * @param[in] right2_coeff Weight of the second right pixel
- * @param[in] right3_coeff Weight of the third right pixel
- * @param[in] right4_coeff Weight of the most right pixel
- *
- * @return a short8 containing 8 convoluted values.
- */
-VEC_DATA_TYPE(DATA_TYPE, 8)
-convolution1x9(
-    __global const uchar *left_pixel,
-    const short           left1_coeff,
-    const short           left2_coeff,
-    const short           left3_coeff,
-    const short           left4_coeff,
-    const short           middle_coeff,
-    const short           right1_coeff,
-    const short           right2_coeff,
-    const short           right3_coeff,
-    const short           right4_coeff)
-{
-    uchar16 temp = vload16(0, left_pixel);
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    left4 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    middle = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right1 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right2 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right3 = CONVERT(temp.s789abcde, VEC_DATA_TYPE(DATA_TYPE, 8));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    right4 = CONVERT(temp.s89abcdef, VEC_DATA_TYPE(DATA_TYPE, 8));
-
-    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + left4 * (VEC_DATA_TYPE(DATA_TYPE,
-            8))left4_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE,
-                    8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff + right4 * (VEC_DATA_TYPE(DATA_TYPE, 8))right4_coeff;
-}
-
-/** Compute a 1D vertical convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] src          Pointer to source image.
- * @param[in] up1_coeff    Weight of the most up pixel
- * @param[in] up2_coeff    Weight of the second up pixel
- * @param[in] up3_coeff    Weight of the third up pixel
- * @param[in] up4_coeff    Weight of the up pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff  Weight of the down pixel
- * @param[in] down2_coeff  Weight of the second down pixel
- * @param[in] down3_coeff  Weight of the third down pixel
- * @param[in] down4_coeff  Weight of the most down pixel
- *
- * @return a short8 containing 8 convoluted values.
- */
-VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-convolution9x1(
-    Image      *src,
-    const short up1_coeff,
-    const short up2_coeff,
-    const short up3_coeff,
-    const short up4_coeff,
-    const short middle_coeff,
-    const short down1_coeff,
-    const short down2_coeff,
-    const short down3_coeff,
-    const short down4_coeff)
-{
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    val;
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up4_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
-
-    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
-    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down4_coeff;
-
-    return out;
-}
-
-/** Apply a 9x9 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:\n
- * [  mat0,  mat1,  mat2,  mat3 , mat4,  mat5,  mat6,  mat7, mat8 ]\n
- * [  mat9,  mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17 ]\n
- * [  mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26 ]\n
- * [  mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35 ]\n
- * [  mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44 ]\n
- * [  mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53 ]\n
- * [  mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62 ]
- * [  mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71 ]
- * [  mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80 ]
- *
- * @param[in] src   A pointer to source Image structure.
- * @param[in] mat0  Coefficient from the convolution matrix
- * @param[in] mat1  Coefficient from the convolution matrix
- * @param[in] mat2  Coefficient from the convolution matrix
- * @param[in] mat3  Coefficient from the convolution matrix
- * @param[in] mat4  Coefficient from the convolution matrix
- * @param[in] mat5  Coefficient from the convolution matrix
- * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat7  Coefficient from the convolution matrix
- * @param[in] mat8  Coefficient from the convolution matrix
- * @param[in] mat9  Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
- * @param[in] mat11 Coefficient from the convolution matrix
- * @param[in] mat12 Coefficient from the convolution matrix
- * @param[in] mat13 Coefficient from the convolution matrix
- * @param[in] mat14 Coefficient from the convolution matrix
- * @param[in] mat15 Coefficient from the convolution matrix
- * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat17 Coefficient from the convolution matrix
- * @param[in] mat18 Coefficient from the convolution matrix
- * @param[in] mat19 Coefficient from the convolution matrix
- * @param[in] mat20 Coefficient from the convolution matrix
- * @param[in] mat21 Coefficient from the convolution matrix
- * @param[in] mat22 Coefficient from the convolution matrix
- * @param[in] mat23 Coefficient from the convolution matrix
- * @param[in] mat24 Coefficient from the convolution matrix
- * @param[in] mat25 Coefficient from the convolution matrix
- * @param[in] mat26 Coefficient from the convolution matrix
- * @param[in] mat27 Coefficient from the convolution matrix
- * @param[in] mat28 Coefficient from the convolution matrix
- * @param[in] mat29 Coefficient from the convolution matrix
- * @param[in] mat30 Coefficient from the convolution matrix
- * @param[in] mat31 Coefficient from the convolution matrix
- * @param[in] mat32 Coefficient from the convolution matrix
- * @param[in] mat33 Coefficient from the convolution matrix
- * @param[in] mat34 Coefficient from the convolution matrix
- * @param[in] mat35 Coefficient from the convolution matrix
- * @param[in] mat36 Coefficient from the convolution matrix
- * @param[in] mat37 Coefficient from the convolution matrix
- * @param[in] mat38 Coefficient from the convolution matrix
- * @param[in] mat39 Coefficient from the convolution matrix
- * @param[in] mat40 Coefficient from the convolution matrix
- * @param[in] mat41 Coefficient from the convolution matrix
- * @param[in] mat42 Coefficient from the convolution matrix
- * @param[in] mat43 Coefficient from the convolution matrix
- * @param[in] mat44 Coefficient from the convolution matrix
- * @param[in] mat45 Coefficient from the convolution matrix
- * @param[in] mat46 Coefficient from the convolution matrix
- * @param[in] mat47 Coefficient from the convolution matrix
- * @param[in] mat48 Coefficient from the convolution matrix
- * @param[in] mat49 Coefficient from the convolution matrix
- * @param[in] mat50 Coefficient from the convolution matrix
- * @param[in] mat51 Coefficient from the convolution matrix
- * @param[in] mat52 Coefficient from the convolution matrix
- * @param[in] mat53 Coefficient from the convolution matrix
- * @param[in] mat54 Coefficient from the convolution matrix
- * @param[in] mat55 Coefficient from the convolution matrix
- * @param[in] mat56 Coefficient from the convolution matrix
- * @param[in] mat57 Coefficient from the convolution matrix
- * @param[in] mat58 Coefficient from the convolution matrix
- * @param[in] mat59 Coefficient from the convolution matrix
- * @param[in] mat60 Coefficient from the convolution matrix
- * @param[in] mat61 Coefficient from the convolution matrix
- * @param[in] mat62 Coefficient from the convolution matrix
- * @param[in] mat63 Coefficient from the convolution matrix
- * @param[in] mat64 Coefficient from the convolution matrix
- * @param[in] mat65 Coefficient from the convolution matrix
- * @param[in] mat66 Coefficient from the convolution matrix
- * @param[in] mat67 Coefficient from the convolution matrix
- * @param[in] mat68 Coefficient from the convolution matrix
- * @param[in] mat69 Coefficient from the convolution matrix
- * @param[in] mat70 Coefficient from the convolution matrix
- * @param[in] mat71 Coefficient from the convolution matrix
- * @param[in] mat72 Coefficient from the convolution matrix
- * @param[in] mat73 Coefficient from the convolution matrix
- * @param[in] mat74 Coefficient from the convolution matrix
- * @param[in] mat75 Coefficient from the convolution matrix
- * @param[in] mat76 Coefficient from the convolution matrix
- * @param[in] mat77 Coefficient from the convolution matrix
- * @param[in] mat78 Coefficient from the convolution matrix
- * @param[in] mat79 Coefficient from the convolution matrix
- * @param[in] mat80 Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
- *
- */
-short8 convolution9x9(
-    Image      *src,
-    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
-    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
-    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
-    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
-    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
-    const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
-    const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
-    const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
-    const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
-    const short mat45, const short mat46, const short mat47, const short mat48, const short mat49,
-    const short mat50, const short mat51, const short mat52, const short mat53, const short mat54,
-    const short mat55, const short mat56, const short mat57, const short mat58, const short mat59,
-    const short mat60, const short mat61, const short mat62, const short mat63, const short mat64,
-    const short mat65, const short mat66, const short mat67, const short mat68, const short mat69,
-    const short mat70, const short mat71, const short mat72, const short mat73, const short mat74,
-    const short mat75, const short mat76, const short mat77, const short mat78, const short mat79,
-    const short mat80, uint scale)
-{
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels;
-
-    pixels = convolution1x9(offset(src, -4, -4), mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8);
-    pixels += convolution1x9(offset(src, -4, -3), mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17);
-    pixels += convolution1x9(offset(src, -4, -2), mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26);
-    pixels += convolution1x9(offset(src, -4, -1), mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35);
-    pixels += convolution1x9(offset(src, -4, 0), mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44);
-    pixels += convolution1x9(offset(src, -4, 1), mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53);
-    pixels += convolution1x9(offset(src, -4, 2), mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62);
-    pixels += convolution1x9(offset(src, -4, 3), mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71);
-    pixels += convolution1x9(offset(src, -4, 4), mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80);
-
-    if(scale > 0)
-    {
-        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
-    }
-
-    return convert_short8_sat(pixels);
-}
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 1x9 static convolution matrix to a single channel U8 input image and output a single temporary channel image.
- *
- * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8) and DATA_TYPE need to be passed at compile time:\n
- * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT8=8, -DCOMPUTE_TYPE=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable1x9_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels = convolution1x9(offset(&src, -4, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8);
-
-    // Store result in dst
-    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-/** Apply a 9x1 static convolution matrix to a single channel U8 input image and output a single channel image.
- *
- * @attention The matrix coefficients (MAT9, MAT10, ... MAT17, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT9=9 -DMAT10=10, ... -DMAT17=17, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable9x1_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Output pixels
-    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-    pixels = convolution9x1(&src, MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17);
-
-    // Divide by the scale
-    pixels = pixels / (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
-
-    // Store result in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-/** Apply a static 9x9 convolution matrix to a single channel U8 input image and output a single channel image including borders
- *
- * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution9x9_static(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    short8 pixels = convolution9x9(&src,
-                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
-                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
-                                   MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
-                                   MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, MAT49,
-                                   MAT50, MAT51, MAT52, MAT53, MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61,
-                                   MAT62, MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, MAT72, MAT73,
-                                   MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80, SCALE);
-
-    // Store the result as is in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl
deleted file mode 100644
index 925a698..0000000
--- a/src/core/CL/cl_kernels/convolution_rectangle.cl
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "convolution3x3.cl"
-#include "convolution5x5.cl"
-#include "convolution7x7.cl"
-#include "convolution9x9.cl"
-#include "helpers.h"
-
-#define MAT_INDEX(i) MAT##i
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a rectangle matrix to a single channel U8 input image and output a single channel image including borders
- *
- * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), MATRIX_WIDTH, MATRIX_HEIGHT, COMPUTE_TYPE, DATA_TYPE, DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DMATRIX_WIDTH=3, -DMATRIX_HEIGHT=5, -DCOMPUTE_TYPE=int, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_rectangle(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    short matrix_coeff[81] =
-    {
-        MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8,
-        MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17,
-        MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, MAT26,
-        MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35,
-        MAT36, MAT37, MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44,
-        MAT45, MAT46, MAT47, MAT48, MAT49, MAT50, MAT51, MAT52, MAT53,
-        MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, MAT62,
-        MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71,
-        MAT72, MAT73, MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80
-    };
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    pixels = (VEC_DATA_TYPE(DATA_TYPE, 8))0;
-
-    for(int i = 0; i < MATRIX_HEIGHT; i++)
-    {
-#if MATRIX_WIDTH == 3
-        pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3],
-                                 matrix_coeff[2 + i * 3]);
-#endif /* MATRIX_WIDTH */
-
-#if MATRIX_WIDTH == 5
-        pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5],
-                                 matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]);
-#endif /* MATRIX_WIDTH */
-
-#if MATRIX_WIDTH == 7
-        pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 7], matrix_coeff[1 + i * 7],
-                                 matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7],
-                                 matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]);
-#endif /* MATRIX_WIDTH */
-
-#if MATRIX_WIDTH == 9
-        pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9],
-                                 matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9],
-                                 matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]);
-#endif /* MATRIX_WIDTH */
-    }
-
-    pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE;
-
-    // Store the result as is in dst
-    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr));
-}
-
-#endif /* not DYNAMIC_MATRIX_CONVOLUTION */
diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl
deleted file mode 100644
index dddbb4d..0000000
--- a/src/core/CL/cl_kernels/derivative.cl
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This OpenCL kernel that computes the first-order derivative.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void derivative(
-    IMAGE_DECLARATION(src)
-#ifdef GRAD_X
-    ,
-    IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    ,
-    IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
-    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-#ifdef GRAD_X
-    short16 l_data = convert_short16(vload16(0, offset(&src, -1, 0)));
-    short16 r_data = convert_short16(vload16(0, offset(&src, 1, 0)));
-    vstore16(r_data - l_data, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    short16 t_data = convert_short16(vload16(0, offset(&src, 0, -1)));
-    short16 b_data = convert_short16(vload16(0, offset(&src, 0, 1)));
-    vstore16(b_data - t_data, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
diff --git a/src/core/CL/cl_kernels/dilate.cl b/src/core/CL/cl_kernels/dilate.cl
deleted file mode 100644
index 14362c1..0000000
--- a/src/core/CL/cl_kernels/dilate.cl
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function dilates an input image.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void dilate(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 top    = vload16(0, offset(&src, -1, -1));
-    uchar16 middle = vload16(0, offset(&src, -1, 0));
-    uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
-    uchar16 tmp = max(top, max(middle, bottom));
-    uchar8  out = max(tmp.s01234567, max(tmp.s12345678, tmp.s23456789));
-
-    vstore8(out, 0, dst.ptr);
-}
diff --git a/src/core/CL/cl_kernels/erode.cl b/src/core/CL/cl_kernels/erode.cl
deleted file mode 100644
index 810c5fc..0000000
--- a/src/core/CL/cl_kernels/erode.cl
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function erodes an input image image.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void erode(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uchar16 top    = vload16(0, offset(&src, -1, -1));
-    uchar16 middle = vload16(0, offset(&src, -1, 0));
-    uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
-    uchar16 tmp = min(top, min(middle, bottom));
-    uchar8  out = min(tmp.s01234567, min(tmp.s12345678, tmp.s23456789));
-
-    vstore8(out, 0, dst.ptr);
-}
diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl
deleted file mode 100644
index 89c144a..0000000
--- a/src/core/CL/cl_kernels/fast_corners.cl
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-/* The map table to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P.
- *
- *      . . F 0 1 . . .
- *      . E . . . 2 . .
- *      D . . . . . 3 .
- *      C . . P . . 4 .
- *      B . . . . . 5 .
- *      . A . . . 6 . .
- *      . . 9 8 7 . . .
- */
-constant int offsets_s[16][2] =
-{
-    { 0, -3 },  // 0
-    { 1, -3 },  // 1
-    { 2, -2 },  // 2
-    { 3, -1 },  // 3
-    { 3, 0 },   // 4
-    { 3, 1 },   // 5
-    { 2, 2 },   // 6
-    { 1, 3 },   // 7
-    { 0, 3 },   // 8
-    { -1, 3 },  // 9
-    { -2, 2 },  // A
-    { -3, 1 },  // B
-    { -3, 0 },  // C
-    { -3, -1 }, // D
-    { -2, -2 }, // E
-    { -1, -3 }, // F
-};
-
-/** Load a pixel and set the mask values.
- *
- * @param[in]  ptr         The pointer to the starting address of source image
- * @param[in]  a           Index to indicate the position in the Bresenham circle
- * @param[in]  stride      Stride of source image in x dimension
- * @param[in]  dark        The left end of the threshold range
- * @param[in]  bright      The right end of the threshold range
- * @param[out] dark_mask   The bit-set mask records dark pixels. Its bit is set as 1 if the corresponding pixel is dark
- * @param[out] bright_mask The bit-set mask records bright pixels. Its bit is set as 1 if the corresponding pixel is bright
- *
- */
-#define LOAD_AND_SET_MASK(ptr, a, stride, dark, bright, dark_mask, bright_mask) \
-    {                                                                           \
-        unsigned char pixel;                                                    \
-        pixel = *(ptr + (int)stride * offsets_s[a][1] + offsets_s[a][0]);       \
-        dark_mask |= (pixel < dark) << a;                                       \
-        bright_mask |= (pixel > bright) << a;                                   \
-    }
-
-/** Checks if a pixel is a corner. Pixel is considerred as a corner if the 9 continuous pixels in the Bresenham circle are bright or dark.
- *
- * @param[in]  bright_mask The mask recording postions of bright pixels
- * @param[in]  dark_mask   The mask recording postions of dark pixels
- * @param[out] isCorner    Indicate whether candidate pixel is corner
- */
-#define CHECK_CORNER(bright_mask, dark_mask, isCorner)    \
-    {                                                     \
-        for(int i = 0; i < 16; i++)                       \
-        {                                                 \
-            isCorner |= ((bright_mask & 0x1FF) == 0x1FF); \
-            isCorner |= ((dark_mask & 0x1FF) == 0x1FF);   \
-            if(isCorner)                                  \
-            {                                             \
-                break;                                    \
-            }                                             \
-            bright_mask >>= 1;                            \
-            dark_mask >>= 1;                              \
-        }                                                 \
-    }
-
-/* Calculate pixel's strength */
-uchar compute_strength(uchar candidate_pixel, __global unsigned char *ptr, unsigned int stride, unsigned char threshold)
-{
-    short a = threshold;
-    short b = 255;
-    while(b - a > 1)
-    {
-        uchar        c           = convert_uchar_sat((a + b) / 2);
-        unsigned int bright_mask = 0;
-        unsigned int dark_mask   = 0;
-
-        unsigned char p_bright = add_sat(candidate_pixel, c);
-        unsigned char p_dark   = sub_sat(candidate_pixel, c);
-
-        bool isCorner = 0;
-
-        for(uint i = 0; i < 16; i++)
-        {
-            LOAD_AND_SET_MASK(ptr, i, stride, p_dark, p_bright, dark_mask, bright_mask)
-        }
-
-        bright_mask |= (bright_mask << 16);
-        dark_mask |= (dark_mask << 16);
-        CHECK_CORNER(bright_mask, dark_mask, isCorner);
-
-        if(isCorner)
-        {
-            a = convert_short(c);
-        }
-        else
-        {
-            b = convert_short(c);
-        }
-    }
-    return a;
-}
-
-/** Fast corners implementation. Calculates and returns the strength of each pixel.
- *
- * The algorithm loops through the 16 pixels in the Bresenham circle and set low 16 bit of masks if corresponding pixel is bright
- * or dark. It then copy the low 16 bit to the high 16 bit of the masks. Right shift the bit to check whether the 9 continuous bits
- * from the LSB are set.
- *
- * @param[in]  input_ptr                            Pointer to the first source image. Supported data types: U8
- * @param[in]  input_stride_x                       Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
- * @param[out] output_ptr                           Pointer to the first source image. Supported data types: U8
- * @param[in]  output_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  threshold_value                      Threshold value.
- *
- */
-__kernel void fast_corners(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output),
-    float threshold_value)
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    const unsigned char threshold = (uchar)threshold_value;
-
-    unsigned int bright_mask = 0;
-    unsigned int dark_mask   = 0;
-
-    unsigned char isCorner = 0;
-
-    unsigned char p        = *in.ptr;
-    unsigned char p_bright = add_sat(p, threshold);
-    unsigned char p_dark   = sub_sat(p, threshold);
-
-    LOAD_AND_SET_MASK(in.ptr, 0, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 4, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 8, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 12, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-
-    if(((bright_mask | dark_mask) & 0x1111) == 0)
-    {
-        *out.ptr = 0;
-        return;
-    }
-
-    LOAD_AND_SET_MASK(in.ptr, 1, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 2, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 3, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 5, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 6, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 7, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 9, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 10, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 11, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 13, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 14, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-    LOAD_AND_SET_MASK(in.ptr, 15, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-
-    bright_mask |= (bright_mask << 16);
-    dark_mask |= (dark_mask << 16);
-
-    CHECK_CORNER(bright_mask, dark_mask, isCorner)
-
-    if(!isCorner)
-    {
-        *out.ptr = 0;
-        return;
-    }
-
-#ifdef USE_MAXSUPPRESSION
-    *out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold);
-#else  /* USE_MAXSUPPRESSION */
-    *out.ptr = 1;
-#endif /* USE_MAXSUPPRESSION */
-}
-
-/** Copy result to Keypoint buffer and count number of corners
- *
- * @param[in]  input_ptr                           Pointer to the image with calculated strenghs. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  max_num_points                      The maximum number of keypoints the array can hold
- * @param[out] offset                              The number of skipped pixels in x dimension
- * @param[out] num_of_points                       Number of points found
- * @param[out] out                                 The keypoints found
- *
- */
-__kernel void copy_to_keypoint(
-    IMAGE_DECLARATION(input),
-    uint     max_num_points,
-    uint     offset,
-    __global uint *num_of_points,
-    __global Keypoint *out)
-{
-#ifndef UPDATE_NUMBER
-    if(*num_of_points >= max_num_points)
-    {
-        return;
-    }
-#endif /* UPDATE_NUMBER */
-
-    Image in = CONVERT_TO_IMAGE_STRUCT(input);
-
-    uchar value = *in.ptr;
-
-    if(value > 0)
-    {
-        int id = atomic_inc(num_of_points);
-        if(id < max_num_points)
-        {
-            out[id].strength        = value;
-            out[id].x               = get_global_id(0) + offset;
-            out[id].y               = get_global_id(1) + offset;
-            out[id].tracking_status = 1;
-            out[id].scale           = 0.f;
-            out[id].orientation     = 0.f;
-            out[id].error           = 0.f;
-        }
-    }
-}
diff --git a/src/core/CL/cl_kernels/gaussian_pyramid.cl b/src/core/CL/cl_kernels/gaussian_pyramid.cl
deleted file mode 100644
index ae2c31a..0000000
--- a/src/core/CL/cl_kernels/gaussian_pyramid.cl
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Computes the Gaussian Filter 1x5 + sub-sampling along the X direction
- *
- * @note Each thread computes 8 pixels
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void gaussian1x5_sub_x(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values for the convolution (20 bytes needed)
-    uchar16 temp0 = vload16(0, src.ptr);
-    uchar4  temp1 = vload4(0, src.ptr + 16);
-
-    // Convert to USHORT8
-    ushort8 l2_data = convert_ushort8((uchar8)(temp0.s02468ACE));
-    ushort8 l1_data = convert_ushort8((uchar8)(temp0.s13579BDF));
-    ushort8 m_data  = convert_ushort8((uchar8)(temp0.s2468, temp0.sACE, temp1.s0));
-    ushort8 r1_data = convert_ushort8((uchar8)(temp0.s3579, temp0.sBDF, temp1.s1));
-    ushort8 r2_data = convert_ushort8((uchar8)(temp0.s468A, temp0.sCE, temp1.s02));
-
-    // Compute convolution along the X direction
-    ushort8 pixels = l2_data + r2_data;
-    pixels += l1_data * (ushort8)4;
-    pixels += m_data * (ushort8)6;
-    pixels += r1_data * (ushort8)4;
-
-    // Store result
-    vstore8(pixels, 0, (__global ushort *)dst.ptr);
-}
-
-/** Computes the Gaussian Filter 5x1 + sub-sampling along the Y direction
- *
- * @note Each thread computes 8 pixels
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void gaussian5x1_sub_y(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    ushort8 u2_data = vload8(0, (__global ushort *)offset(&src, 0, 0));
-    ushort8 u1_data = vload8(0, (__global ushort *)offset(&src, 0, 1));
-    ushort8 m_data  = vload8(0, (__global ushort *)offset(&src, 0, 2));
-    ushort8 d1_data = vload8(0, (__global ushort *)offset(&src, 0, 3));
-    ushort8 d2_data = vload8(0, (__global ushort *)offset(&src, 0, 4));
-
-    // Compute convolution along the Y direction
-    ushort8 pixels = u2_data + d2_data;
-    pixels += u1_data * (ushort8)4;
-    pixels += m_data * (ushort8)6;
-    pixels += d1_data * (ushort8)4;
-
-    // Scale result
-    pixels >>= (ushort8)8;
-
-    // Store result
-    vstore8(convert_uchar8_sat(pixels), 0, dst.ptr);
-}
diff --git a/src/core/CL/cl_kernels/harris_corners.cl b/src/core/CL/cl_kernels/harris_corners.cl
deleted file mode 100644
index 3e3c9fd..0000000
--- a/src/core/CL/cl_kernels/harris_corners.cl
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Function running harris score on 3x3 block size
- *
- * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
- *             e.g. -DDATA_TYPE=short.
- *
- * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
- * @param[in]  src_gx_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gx_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
- * @param[in]  src_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
- * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
- * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
- * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
- * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
- * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores
- * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
- */
-__kernel void harris_score_3x3(
-    IMAGE_DECLARATION(src_gx),
-    IMAGE_DECLARATION(src_gy),
-    IMAGE_DECLARATION(vc),
-    float sensitivity,
-    float strength_thresh,
-    float pow4_normalization_factor)
-{
-    Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
-    Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
-    Image vc     = CONVERT_TO_IMAGE_STRUCT(vc);
-
-    /* Gx^2, Gy^2 and Gx*Gy */
-    float4 gx2  = (float4)0.0f;
-    float4 gy2  = (float4)0.0f;
-    float4 gxgy = (float4)0.0f;
-
-    /* Row0 */
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, -1));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, -1));
-
-    float4 l_gx = convert_float4(temp_gx.s0123);
-    float4 m_gx = convert_float4(temp_gx.s1234);
-    float4 r_gx = convert_float4(temp_gx.s2345);
-
-    float4 l_gy = convert_float4(temp_gy.s0123);
-    float4 m_gy = convert_float4(temp_gy.s1234);
-    float4 r_gy = convert_float4(temp_gy.s2345);
-
-    gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
-    gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
-    gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
-
-    /* Row1 */
-    temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 0));
-    temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 0));
-
-    l_gx = convert_float4(temp_gx.s0123);
-    m_gx = convert_float4(temp_gx.s1234);
-    r_gx = convert_float4(temp_gx.s2345);
-
-    l_gy = convert_float4(temp_gy.s0123);
-    m_gy = convert_float4(temp_gy.s1234);
-    r_gy = convert_float4(temp_gy.s2345);
-
-    gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
-    gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
-    gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
-
-    /* Row2 */
-    temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 1));
-    temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 1));
-
-    l_gx = convert_float4(temp_gx.s0123);
-    m_gx = convert_float4(temp_gx.s1234);
-    r_gx = convert_float4(temp_gx.s2345);
-
-    l_gy = convert_float4(temp_gy.s0123);
-    m_gy = convert_float4(temp_gy.s1234);
-    r_gy = convert_float4(temp_gy.s2345);
-
-    gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
-    gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
-    gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
-
-    /* Compute trace and determinant */
-    float4 trace = gx2 + gy2;
-    float4 det   = gx2 * gy2 - (gxgy * gxgy);
-
-    /* Compute harris score */
-    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
-
-    mc = select(0.0f, mc, mc > (float4)strength_thresh);
-
-    vstore4(mc, 0, (__global float *)vc.ptr);
-}
-
-/** Function for calculating harris score 1x5.
- *
- * @param[in] src_gx Pointer to gx gradient image.
- * @param[in] src_gy Pointer to gy gradient image.
- * @param[in] row    Relative row.
- */
-inline float16 harris_score_1x5(Image *src_gx, Image *src_gy, int row)
-{
-    float4 gx2  = 0.0f;
-    float4 gy2  = 0.0f;
-    float4 gxgy = 0.0f;
-
-    /* Row */
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gx = vload8(0, (__global DATA_TYPE *)offset(src_gx, -2, row));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gy = vload8(0, (__global DATA_TYPE *)offset(src_gy, -2, row));
-
-    float4 gx = convert_float4(temp_gx.s0123);
-    float4 gy = convert_float4(temp_gy.s0123);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx.s1234);
-    gy = convert_float4(temp_gy.s1234);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx.s2345);
-    gy = convert_float4(temp_gy.s2345);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx.s3456);
-    gy = convert_float4(temp_gy.s3456);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx.s4567);
-    gy = convert_float4(temp_gy.s4567);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    return (float16)(gx2, gy2, gxgy, (float4)0);
-}
-
-/** Function running harris score on 5x5 block size
- *
- * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
- *             e.g. -DDATA_TYPE=short.
- *
- * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
- * @param[in]  src_gx_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gx_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
- * @param[in]  src_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
- * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
- * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
- * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
- * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
- * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores
- * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
- */
-__kernel void harris_score_5x5(
-    IMAGE_DECLARATION(src_gx),
-    IMAGE_DECLARATION(src_gy),
-    IMAGE_DECLARATION(vc),
-    float sensitivity,
-    float strength_thresh,
-    float pow4_normalization_factor)
-{
-    Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
-    Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
-    Image vc     = CONVERT_TO_IMAGE_STRUCT(vc);
-
-    /* Gx^2, Gy^2 and Gx*Gy */
-    float16 res = (float16)0.0f;
-
-    /* Compute row */
-    for(int i = -2; i < 3; i++)
-    {
-        res += harris_score_1x5(&src_gx, &src_gy, i);
-    }
-
-    float4 gx2  = res.s0123;
-    float4 gy2  = res.s4567;
-    float4 gxgy = res.s89AB;
-
-    /* Compute trace and determinant */
-    float4 trace = gx2 + gy2;
-    float4 det   = gx2 * gy2 - (gxgy * gxgy);
-
-    /* Compute harris score */
-    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
-
-    mc = select(0.0f, mc, mc > (float4)strength_thresh);
-
-    vstore4(mc, 0, (__global float *)vc.ptr);
-}
-
-/** Function for calculating harris score 1x7.
- *
- * @param[in] src_gx Pointer to gx gradient image.
- * @param[in] src_gy Pointer to gy gradient image.
- * @param[in] row    Relative row.
- */
-inline float16 harris_score_1x7(Image *src_gx, Image *src_gy, int row)
-{
-    float4 gx2  = 0.0f;
-    float4 gy2  = 0.0f;
-    float4 gxgy = 0.0f;
-
-    /* Row */
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gx0 = vload8(0, (__global DATA_TYPE *)offset(src_gx, -3, row));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    temp_gy0 = vload8(0, (__global DATA_TYPE *)offset(src_gy, -3, row));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    temp_gx1 = vload2(0, (__global DATA_TYPE *)offset(src_gx, 5, row));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    temp_gy1 = vload2(0, (__global DATA_TYPE *)offset(src_gy, 5, row));
-
-    float4 gx = convert_float4(temp_gx0.s0123);
-    float4 gy = convert_float4(temp_gy0.s0123);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx0.s1234);
-    gy = convert_float4(temp_gy0.s1234);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx0.s2345);
-    gy = convert_float4(temp_gy0.s2345);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx0.s3456);
-    gy = convert_float4(temp_gy0.s3456);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4(temp_gx0.s4567);
-    gy = convert_float4(temp_gy0.s4567);
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s567, temp_gx1.s0));
-    gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s567, temp_gy1.s0));
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s67, temp_gx1.s01));
-    gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s67, temp_gy1.s01));
-    gx2 += (gx * gx);
-    gy2 += (gy * gy);
-    gxgy += (gx * gy);
-
-    return (float16)(gx2, gy2, gxgy, (float4)0);
-}
-
-/** Function running harris score on 7x7 block size
- *
- * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
- *             e.g. -DDATA_TYPE=short.
- *
- * @param[in]  src_gx_ptr                           Pointer to the first source image. Supported data types: S16, S32
- * @param[in]  src_gx_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_gx_step_x                        src_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gx_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_gx_step_y                        src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gx_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  src_gy_ptr                           Pointer to the second source image. Supported data types: S16, S32
- * @param[in]  src_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  src_gy_step_x                        src_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  src_gy_step_y                        src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] vc_ptr                               Pointer to the destination image. Supported data types: F32
- * @param[in]  vc_stride_x                          Stride of the destination image in X dimension (in bytes)
- * @param[in]  vc_step_x                            vc_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  vc_stride_y                          Stride of the destination image in Y dimension (in bytes)
- * @param[in]  vc_step_y                            vc_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  vc_offset_first_element_in_bytes     The offset of the first element in the destination image
- * @param[in]  sensitivity                          Sensitivity threshold k from the Harris-Stephens equation
- * @param[in]  strength_thresh                      Minimum threshold with which to eliminate Harris Corner scores
- * @param[in]  pow4_normalization_factor            Normalization factor to apply harris score
- */
-__kernel void harris_score_7x7(
-    IMAGE_DECLARATION(src_gx),
-    IMAGE_DECLARATION(src_gy),
-    IMAGE_DECLARATION(vc),
-    float sensitivity,
-    float strength_thresh,
-    float pow4_normalization_factor)
-{
-    Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
-    Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
-    Image vc     = CONVERT_TO_IMAGE_STRUCT(vc);
-
-    /* Gx^2, Gy^2 and Gx*Gy */
-    float16 res = (float16)0.0f;
-
-    /* Compute row */
-    for(int i = -3; i < 4; i++)
-    {
-        res += harris_score_1x7(&src_gx, &src_gy, i);
-    }
-
-    float4 gx2  = res.s0123;
-    float4 gy2  = res.s4567;
-    float4 gxgy = res.s89AB;
-
-    /* Compute trace and determinant */
-    float4 trace = gx2 + gy2;
-    float4 det   = gx2 * gy2 - (gxgy * gxgy);
-
-    /* Compute harris score */
-    float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
-
-    mc = select(0.0f, mc, mc > (float4)strength_thresh);
-
-    vstore4(mc, 0, (__global float *)vc.ptr);
-}
diff --git a/src/core/CL/cl_kernels/histogram.cl b/src/core/CL/cl_kernels/histogram.cl
deleted file mode 100644
index a93cb4d..0000000
--- a/src/core/CL/cl_kernels/histogram.cl
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define VATOMIC_INC16(histogram, win_pos)   \
-    {                                       \
-        atomic_inc(histogram + win_pos.s0); \
-        atomic_inc(histogram + win_pos.s1); \
-        atomic_inc(histogram + win_pos.s2); \
-        atomic_inc(histogram + win_pos.s3); \
-        atomic_inc(histogram + win_pos.s4); \
-        atomic_inc(histogram + win_pos.s5); \
-        atomic_inc(histogram + win_pos.s6); \
-        atomic_inc(histogram + win_pos.s7); \
-        atomic_inc(histogram + win_pos.s8); \
-        atomic_inc(histogram + win_pos.s9); \
-        atomic_inc(histogram + win_pos.sa); \
-        atomic_inc(histogram + win_pos.sb); \
-        atomic_inc(histogram + win_pos.sc); \
-        atomic_inc(histogram + win_pos.sd); \
-        atomic_inc(histogram + win_pos.se); \
-        atomic_inc(histogram + win_pos.sf); \
-    }
-
-/** Calculate the histogram of an 8 bit grayscale image.
- *
- * Each thread will process 16 pixels and use one local atomic operation per pixel.
- * When all work items in a work group are done the resulting local histograms are
- * added to the global histogram using global atomics.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of length of num_bins
- *
- * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  histogram_local                     The local buffer to hold histogram result in per workgroup. Supported data types: U32
- * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
- * @param[out] num_bins                            The number of bins
- * @param[out] offset                              The start of values to use (inclusive)
- * @param[out] range                               The range of a bin
- * @param[out] offrange                            The maximum value (exclusive)
- */
-__kernel void hist_local_kernel(IMAGE_DECLARATION(input),
-                                __local uint *histogram_local,
-                                __global uint *restrict histogram,
-                                uint                    num_bins,
-                                uint                    offset,
-                                uint                    range,
-                                uint                    offrange)
-{
-    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
-    uint  local_id_x   = get_local_id(0);
-
-    uint local_x_size = get_local_size(0);
-
-    if(num_bins > local_x_size)
-    {
-        for(int i = local_id_x; i < num_bins; i += local_x_size)
-        {
-            histogram_local[i] = 0;
-        }
-    }
-    else
-    {
-        if(local_id_x <= num_bins)
-        {
-            histogram_local[local_id_x] = 0;
-        }
-    }
-
-    uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
-
-    uint16 win_pos = select(num_bins, ((vals - offset) * num_bins) / range, (vals >= offset && vals < offrange));
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    VATOMIC_INC16(histogram_local, win_pos);
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(num_bins > local_x_size)
-    {
-        for(int i = local_id_x; i < num_bins; i += local_x_size)
-        {
-            atomic_add(histogram + i, histogram_local[i]);
-        }
-    }
-    else
-    {
-        if(local_id_x <= num_bins)
-        {
-            atomic_add(histogram + local_id_x, histogram_local[local_id_x]);
-        }
-    }
-}
-
-/** Calculate the histogram of an 8 bit grayscale image's border.
- *
- * Each thread will process one pixel using global atomic.
- * When all work items in a work group are done the resulting local histograms are
- * added to the global histogram using global atomics.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of length of num_bins
- *
- * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
- * @param[out] num_bins                            The number of bins
- * @param[out] offset                              The start of values to use (inclusive)
- * @param[out] range                               The range of a bin
- * @param[out] offrange                            The maximum value (exclusive)
- */
-__kernel void hist_border_kernel(IMAGE_DECLARATION(input),
-                                 __global uint *restrict histogram,
-                                 uint                    num_bins,
-                                 uint                    offset,
-                                 uint                    range,
-                                 uint                    offrange)
-{
-    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
-
-    uint val = (uint)(*input_buffer.ptr);
-
-    uint win_pos = (val >= offset) ? (((val - offset) * num_bins) / range) : 0;
-
-    if(val >= offset && (val < offrange))
-    {
-        atomic_inc(histogram + win_pos);
-    }
-}
-
-/** Calculate the histogram of an 8 bit grayscale image with bin size of 256 and window size of 1.
- *
- * Each thread will process 16 pixels and use one local atomic operation per pixel.
- * When all work items in a work group are done the resulting local histograms are
- * added to the global histogram using global atomics.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of 256 elements
- *
- * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in]  histogram_local                     The local buffer to hold histogram result in per workgroup. Supported data types: U32
- * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
- */
-__kernel void hist_local_kernel_fixed(IMAGE_DECLARATION(input),
-                                      __local uint *histogram_local,
-                                      __global uint *restrict histogram)
-{
-    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
-
-    uint local_index  = get_local_id(0);
-    uint local_x_size = get_local_size(0);
-
-    for(int i = local_index; i < 256; i += local_x_size)
-    {
-        histogram_local[i] = 0;
-    }
-
-    uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    atomic_inc(histogram_local + vals.s0);
-    atomic_inc(histogram_local + vals.s1);
-    atomic_inc(histogram_local + vals.s2);
-    atomic_inc(histogram_local + vals.s3);
-    atomic_inc(histogram_local + vals.s4);
-    atomic_inc(histogram_local + vals.s5);
-    atomic_inc(histogram_local + vals.s6);
-    atomic_inc(histogram_local + vals.s7);
-    atomic_inc(histogram_local + vals.s8);
-    atomic_inc(histogram_local + vals.s9);
-    atomic_inc(histogram_local + vals.sa);
-    atomic_inc(histogram_local + vals.sb);
-    atomic_inc(histogram_local + vals.sc);
-    atomic_inc(histogram_local + vals.sd);
-    atomic_inc(histogram_local + vals.se);
-    atomic_inc(histogram_local + vals.sf);
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for(int i = local_index; i < 256; i += local_x_size)
-    {
-        atomic_add(histogram + i, histogram_local[i]);
-    }
-}
-
-/** Calculate the histogram of an 8 bit grayscale image with bin size as 256 and window size as 1.
- *
- * Each thread will process one pixel using global atomic.
- * When all work items in a work group are done the resulting local histograms are
- * added to the global histogram using global atomics.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of 256
- *
- * @param[in]  input_ptr                           Pointer to the first source image. Supported data types: U8
- * @param[in]  input_stride_x                      Stride of the first source image in X dimension (in bytes)
- * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                      Stride of the first source image in Y dimension (in bytes)
- * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[out] histogram                           The output buffer to hold histogram final result. Supported data types: U32
- */
-__kernel void hist_border_kernel_fixed(IMAGE_DECLARATION(input),
-                                       __global uint *restrict histogram)
-{
-    Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
-    atomic_inc(histogram + *input_buffer.ptr);
-}
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
deleted file mode 100644
index b14f361..0000000
--- a/src/core/CL/cl_kernels/hog.cl
+++ /dev/null
@@ -1,456 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-#if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE)
-
-/** This OpenCL kernel computes the HOG orientation binning
- *
- * @attention The following variables must be passed at compile time:
- *
- * -# -DCELL_WIDTH = Width of the cell
- * -# -DCELL_HEIGHT = height of the cell
- * -# -DNUM_BINS = Number of bins for each cell
- * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG
- *
- * @note Each work-item computes a single cell
- *
- * @param[in]  mag_ptr                             Pointer to the source image which stores the magnitude of the gradient for each pixel. Supported data types: S16
- * @param[in]  mag_stride_x                        Stride of the magnitude image in X dimension (in bytes)
- * @param[in]  mag_step_x                          mag_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  mag_stride_y                        Stride of the magnitude image in Y dimension (in bytes)
- * @param[in]  mag_step_y                          mag_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  mag_offset_first_element_in_bytes   The offset of the first element in the magnitude image
- * @param[in]  phase_ptr                           Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8
- * @param[in]  phase_stride_x                      Stride of the phase image in X dimension (in bytes)
- * @param[in]  phase_step_x                        phase_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  phase_stride_y                      Stride of the the phase image in Y dimension (in bytes)
- * @param[in]  phase_step_y                        phase_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  phase_offset_first_element_in_bytes The offset of the first element in the the phase image
- * @param[out] dst_ptr                             Pointer to the destination image which stores the local HOG for each cell Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in]  dst_stride_x                        Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                        Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination image
- */
-__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
-                                      IMAGE_DECLARATION(phase),
-                                      IMAGE_DECLARATION(dst))
-{
-    float bins[NUM_BINS] = { 0 };
-
-    // Compute address for the magnitude and phase images
-    Image mag   = CONVERT_TO_IMAGE_STRUCT(mag);
-    Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
-
-    __global uchar *mag_row_ptr   = mag.ptr;
-    __global uchar *phase_row_ptr = phase.ptr;
-
-    for(int yc = 0; yc < CELL_HEIGHT; ++yc)
-    {
-        int xc = 0;
-        for(; xc <= (CELL_WIDTH - 4); xc += 4)
-        {
-            // Load magnitude and phase values
-            const float4 mag_f32   = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
-            float4       phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
-
-            // Scale phase: phase * scale + 0.5f
-            phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
-
-            // Compute histogram index.
-            int4 hidx_s32 = convert_int4(phase_f32);
-
-            // Compute magnitude weights (w0 and w1)
-            const float4 hidx_f32 = convert_float4(hidx_s32);
-
-            // w1 = phase_f32 - hidx_s32
-            const float4 w1_f32 = phase_f32 - hidx_f32;
-
-            // w0 = 1.0 - w1
-            const float4 w0_f32 = (float4)1.0f - w1_f32;
-
-            // Calculate the weights for splitting vote
-            const float4 mag_w0_f32 = mag_f32 * w0_f32;
-            const float4 mag_w1_f32 = mag_f32 * w1_f32;
-
-            // Weighted vote between 2 bins
-
-            // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
-            hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
-
-            // Bin 0
-            bins[hidx_s32.s0] += mag_w0_f32.s0;
-            bins[hidx_s32.s1] += mag_w0_f32.s1;
-            bins[hidx_s32.s2] += mag_w0_f32.s2;
-            bins[hidx_s32.s3] += mag_w0_f32.s3;
-
-            hidx_s32 += (int4)1;
-
-            // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
-            hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
-
-            // Bin1
-            bins[hidx_s32.s0] += mag_w1_f32.s0;
-            bins[hidx_s32.s1] += mag_w1_f32.s1;
-            bins[hidx_s32.s2] += mag_w1_f32.s2;
-            bins[hidx_s32.s3] += mag_w1_f32.s3;
-        }
-
-        // Left over computation
-        for(; xc < CELL_WIDTH; xc++)
-        {
-            const float mag_value   = *((__global short *)mag_row_ptr + xc);
-            const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
-            const float w1          = phase_value - floor(phase_value);
-
-            // The quantised phase is the histogram index [0, NUM_BINS - 1]
-            // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
-            const uint hidx = (uint)(phase_value) % NUM_BINS;
-
-            // Weighted vote between 2 bins
-            bins[hidx] += mag_value * (1.0f - w1);
-            bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
-        }
-
-        // Point to the next row of magnitude and phase images
-        mag_row_ptr += mag_stride_y;
-        phase_row_ptr += phase_stride_y;
-    }
-
-    // Compute address for the destination image
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Store the local HOG in the global memory
-    int xc = 0;
-    for(; xc <= (NUM_BINS - 4); xc += 4)
-    {
-        float4 values = vload4(0, bins + xc);
-
-        vstore4(values, 0, ((__global float *)dst.ptr) + xc);
-    }
-
-    // Left over stores
-    for(; xc < NUM_BINS; ++xc)
-    {
-        ((__global float *)dst.ptr)[xc] = bins[xc];
-    }
-}
-#endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */
-
-#if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)
-
-#ifndef L2_NORM
-#error The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel
-#endif /* not L2_NORM */
-
-#ifndef L2HYS_NORM
-#error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel
-#endif /* not L2HYS_NORM */
-
-#ifndef L1_NORM
-#error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel
-#endif /* not L1_NORM */
-
-/** This OpenCL kernel computes the HOG block normalization
- *
- * @attention The following variables must be passed at compile time:
- *
- * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block
- * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction
- * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block
- * -# -DHOG_NORM_TYPE = Normalization type
- * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method
- * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM
- * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM
- * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM
- *
- * @note Each work-item computes a single block
- *
- * @param[in]  src_ptr                           Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image which stores the normlized HOG Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void hog_block_normalization(IMAGE_DECLARATION(src),
-                                      IMAGE_DECLARATION(dst))
-{
-    float  sum     = 0.0f;
-    float4 sum_f32 = (float4)(0.0f);
-
-    // Compute address for the source and destination tensor
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
-    {
-        const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
-
-        int xc = 0;
-        for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
-        {
-            const float4 val0 = vload4(0, hist_ptr + xc + 0);
-            const float4 val1 = vload4(0, hist_ptr + xc + 4);
-            const float4 val2 = vload4(0, hist_ptr + xc + 8);
-            const float4 val3 = vload4(0, hist_ptr + xc + 12);
-
-#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
-            // Compute val^2 for L2_NORM or L2HYS_NORM
-            sum_f32 += val0 * val0;
-            sum_f32 += val1 * val1;
-            sum_f32 += val2 * val2;
-            sum_f32 += val3 * val3;
-#else  /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-            // Compute |val| for L1_NORM
-            sum_f32 += fabs(val0);
-            sum_f32 += fabs(val1);
-            sum_f32 += fabs(val2);
-            sum_f32 += fabs(val3);
-#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-
-            // Store linearly the input values un-normalized in the output image. These values will be reused for the normalization.
-            // This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values
-            // will be accessed consecutively
-            vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
-            vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
-            vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
-            vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
-        }
-
-        // Compute left over
-        for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
-        {
-            const float val = hist_ptr[xc];
-
-#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
-            sum += val * val;
-#else  /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-            sum += fabs(val);
-#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-
-            ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
-        }
-    }
-
-    sum += dot(sum_f32, (float4)1.0f);
-
-    float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
-
-#if(HOG_NORM_TYPE == L2HYS_NORM)
-    // Reset sum
-    sum_f32 = (float4)0.0f;
-    sum     = 0.0f;
-
-    int k = 0;
-    for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
-    {
-        float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
-        float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
-        float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
-        float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
-
-        // Scale val
-        val0 = val0 * (float4)scale;
-        val1 = val1 * (float4)scale;
-        val2 = val2 * (float4)scale;
-        val3 = val3 * (float4)scale;
-
-        // Clip val if over _threshold_l2hys
-        val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
-        val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
-        val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
-        val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
-
-        // Compute val^2
-        sum_f32 += val0 * val0;
-        sum_f32 += val1 * val1;
-        sum_f32 += val2 * val2;
-        sum_f32 += val3 * val3;
-
-        vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
-        vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
-        vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
-        vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
-    }
-
-    // Compute left over
-    for(; k < NUM_BINS_PER_BLOCK; ++k)
-    {
-        float val = ((__global float *)dst.ptr)[k] * scale;
-
-        // Clip scaled input_value if over L2_HYST_THRESHOLD
-        val = fmin(val, (float)L2_HYST_THRESHOLD);
-
-        sum += val * val;
-
-        ((__global float *)dst.ptr)[k] = val;
-    }
-
-    sum += dot(sum_f32, (float4)1.0f);
-
-    // We use the same constants of OpenCV
-    scale = 1.0f / (sqrt(sum) + 1e-3f);
-
-#endif /* (HOG_NORM_TYPE == L2HYS_NORM) */
-
-    int i = 0;
-    for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
-    {
-        float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
-        float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
-        float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
-        float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
-
-        // Multiply val by the normalization scale factor
-        val0 = val0 * (float4)scale;
-        val1 = val1 * (float4)scale;
-        val2 = val2 * (float4)scale;
-        val3 = val3 * (float4)scale;
-
-        vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
-        vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
-        vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
-        vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
-    }
-
-    for(; i < NUM_BINS_PER_BLOCK; ++i)
-    {
-        ((__global float *)dst.ptr)[i] *= scale;
-    }
-}
-#endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */
-
-#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
-
-/** This OpenCL kernel computes the HOG detector using linear SVM
- *
- * @attention The following variables must be passed at compile time:
- *
- * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction
- * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction
- * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
- * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectioWindow array
- * -# -DIDX_CLASS = Index of the class to detect
- * -# -DDETECTION_WINDOW_STRIDE_WIDTH = Detection window stride for the X direction
- * -# -DDETECTION_WINDOW_STRIDE_HEIGHT = Detection window stride for the Y direction
- * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
- * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
- *
- * @note Each work-item computes a single detection window
- *
- * @param[in]  src_ptr                           Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  hog_descriptor                    Pointer to HOG descriptor. Supported data types: F32
- * @param[out] dst                               Pointer to DetectionWindow array
- * @param[out] num_detection_windows             Number of objects detected
- */
-__kernel void hog_detector(IMAGE_DECLARATION(src),
-                           __global float *hog_descriptor,
-                           __global DetectionWindow *dst,
-                           __global uint *num_detection_windows)
-{
-    // Check if the DetectionWindow array is full
-    if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
-    {
-        return;
-    }
-
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-    const int src_step_y_f32 = src_stride_y / sizeof(float);
-
-    // Init score_f32 with 0
-    float4 score_f32 = (float4)0.0f;
-
-    // Init score with 0
-    float score = 0.0f;
-
-    __global float *src_row_ptr = (__global float *)src.ptr;
-
-    // Compute Linear SVM
-    for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
-    {
-        int xb = 0;
-
-        const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
-
-        for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
-        {
-            // Load descriptor values
-            float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
-            float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
-
-            float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
-            float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
-
-            // Multiply accumulate
-            score_f32 += a0_f32 * b0_f32;
-            score_f32 += a1_f32 * b1_f32;
-        }
-
-        for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
-        {
-            const float a = src_row_ptr[xb];
-            const float b = hog_descriptor[xb + offset_y];
-
-            score += a * b;
-        }
-    }
-
-    score += dot(score_f32, (float4)1.0f);
-
-    // Add the bias. The bias is located at the position (descriptor_size() - 1)
-    // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
-    score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
-
-    if(score > (float)THRESHOLD)
-    {
-        int id = atomic_inc(num_detection_windows);
-        if(id < MAX_NUM_DETECTION_WINDOWS)
-        {
-            dst[id].x         = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
-            dst[id].y         = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
-            dst[id].width     = DETECTION_WINDOW_WIDTH;
-            dst[id].height    = DETECTION_WINDOW_HEIGHT;
-            dst[id].idx_class = IDX_CLASS;
-            dst[id].score     = score;
-        }
-    }
-}
-#endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
-        * DETECTION_WINDOW_STRIDE_WIDTH && DETECTION_WINDOW_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
diff --git a/src/core/CL/cl_kernels/integral_image.cl b/src/core/CL/cl_kernels/integral_image.cl
deleted file mode 100644
index dd2c798..0000000
--- a/src/core/CL/cl_kernels/integral_image.cl
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function computes the horizontal integral of the image.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U32
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void integral_horizontal(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    uint prev = 0;
-
-    for(uint j = 0; j < src_step_x; j += 16)
-    {
-        barrier(CLK_GLOBAL_MEM_FENCE);
-        uint16 res = convert_uint16(vload16(0, offset(&src, j, 0)));
-        res.s0 += prev;
-        res.s1 += res.s0;
-        res.s2 += res.s1;
-        res.s3 += res.s2;
-        res.s4 += res.s3;
-        res.s5 += res.s4;
-        res.s6 += res.s5;
-        res.s7 += res.s6;
-        res.s8 += res.s7;
-        res.s9 += res.s8;
-        res.sA += res.s9;
-        res.sB += res.sA;
-        res.sC += res.sB;
-        res.sD += res.sC;
-        res.sE += res.sD;
-        res.sF += res.sE;
-        prev = res.sF;
-        vstore16(res, 0, (__global uint *)offset(&dst, j, 0));
-    }
-}
-
-/** This function computes the vertical integral of the image.
- *
- * @param[in,out] src_ptr                           Pointer to the source image. Supported data types: U32
- * @param[in]     src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]     src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]     height                            Image height.
- */
-__kernel void integral_vertical(
-    IMAGE_DECLARATION(src),
-    uint height)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-    uint8 prev = vload8(0, (__global uint *)offset(&src, 0, 0));
-    for(uint j = 1; j < height; ++j)
-    {
-        barrier(CLK_GLOBAL_MEM_FENCE);
-        uint8 res = vload8(0, (__global uint *)offset(&src, 0, j));
-        res += prev;
-        vstore8(res, 0, (__global uint *)offset(&src, 0, j));
-        prev = res;
-    }
-}
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
deleted file mode 100644
index 48197d6..0000000
--- a/src/core/CL/cl_kernels/magnitude_phase.cl
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Calculates L1 normalization between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return L1 normalization magnitude result. Supported data types: S16, S32
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l1(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
-{
-    return CONVERT_SAT(add_sat(abs(a), abs(b)), VEC_DATA_TYPE(DATA_TYPE, 16));
-}
-
-/** Calculates L2 normalization between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return L2 normalization magnitude result. Supported data types: S16, S32
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l2(int16 a, int16 b)
-{
-    return CONVERT_SAT((sqrt(convert_float16((convert_uint16(a * a) + convert_uint16(b * b)))) + 0.5f),
-                       VEC_DATA_TYPE(DATA_TYPE, 16));
-}
-
-/** Calculates unsigned phase between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return Unsigned phase mapped in the interval [0, 180]. Supported data types: U8
- */
-inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
-{
-    float16 angle_deg_f32 = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f;
-    angle_deg_f32         = select(angle_deg_f32, (float16)180.0f + angle_deg_f32, angle_deg_f32 < (float16)0.0f);
-    return convert_uchar16(angle_deg_f32);
-}
-
-/** Calculates signed phase between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return Signed phase mapped in the interval [0, 256). Supported data types: U8
- */
-inline uchar16 phase_signed(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
-{
-    float16 arct = atan2pi(convert_float16(b), convert_float16(a));
-    arct         = select(arct, arct + 2, arct < 0.0f);
-
-    return convert_uchar16(convert_int16(mad(arct, 128, 0.5f)) & (int16)0xFFu);
-}
-
-#if(1 == MAGNITUDE)
-#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y))
-#elif(2 == MAGNITUDE)
-#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y))
-#else /* MAGNITUDE */
-#define MAGNITUDE_OP(x, y)
-#endif /* MAGNITUDE */
-
-#if(1 == PHASE)
-#define PHASE_OP(x, y) phase_unsigned((x), (y))
-#elif(2 == PHASE)
-#define PHASE_OP(x, y) phase_signed((x), (y))
-#else /* PHASE */
-#define PHASE_OP(x, y)
-#endif /* PHASE */
-
-/** Calculate the magnitude and phase of given the gradients of an image.
- *
- * @note Magnitude calculation supported: L1 normalization(type = 1) and L2 normalization(type = 2).
- * @note Phase calculation supported: Unsigned(type = 1) [0,128] and Signed(type = 2) [0,256).
- *
- * @attention To enable phase calculation -DPHASE="phase_calculation_type_id" must be provided at compile time. eg -DPHASE=1
- * @attention To enable magnitude calculation -DMAGNITUDE="magnitude_calculation_type_id" must be provided at compile time. eg -DMAGNITUDE=1
- * @attention Datatype of the two inputs is passed at compile time using -DDATA_TYPE. e.g -DDATA_TYPE=short. Supported data_types are: short and int
- *
- * @param[in]  gx_ptr                                  Pointer to the first source image (gradient X). Supported data types: S16, S32
- * @param[in]  gx_stride_x                             Stride of the source image in X dimension (in bytes)
- * @param[in]  gx_step_x                               gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  gx_stride_y                             Stride of the source image in Y dimension (in bytes)
- * @param[in]  gx_step_y                               gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  gx_offset_first_element_in_bytes        The offset of the first element in the source image
- * @param[in]  gy_ptr                                  Pointer to the second source image (gradient Y) . Supported data types: S16, S32
- * @param[in]  gy_stride_x                             Stride of the destination image in X dimension (in bytes)
- * @param[in]  gy_step_x                               gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  gy_stride_y                             Stride of the destination image in Y dimension (in bytes)
- * @param[in]  gy_step_y                               gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  gy_offset_first_element_in_bytes        The offset of the first element in the destination image
- * @param[out] magnitude_ptr                           Pointer to the magnitude destination image. Supported data types: S16, S32
- * @param[in]  magnitude_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  magnitude_step_x                        magnitude_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  magnitude_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  magnitude_step_y                        magnitude_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  magnitude_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] phase_ptr                               Pointer to the phase destination image. Supported data types: U8
- * @param[in]  phase_stride_x                          Stride of the destination image in X dimension (in bytes)
- * @param[in]  phase_step_x                            phase_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  phase_stride_y                          Stride of the destination image in Y dimension (in bytes)
- * @param[in]  phase_step_y                            phase_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  phase_offset_first_element_in_bytes     The offset of the first element in the destination image
- * */
-__kernel void magnitude_phase(
-    IMAGE_DECLARATION(gx),
-    IMAGE_DECLARATION(gy)
-#ifdef MAGNITUDE
-    ,
-    IMAGE_DECLARATION(magnitude)
-#endif /* MAGNITUDE */
-#ifdef PHASE
-    ,
-    IMAGE_DECLARATION(phase)
-#endif /* PHASE */
-)
-{
-    // Get pixels pointer
-    Image gx = CONVERT_TO_IMAGE_STRUCT(gx);
-    Image gy = CONVERT_TO_IMAGE_STRUCT(gy);
-
-    // Load values
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    in_a = vload16(0, (__global DATA_TYPE *)gx.ptr);
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    in_b = vload16(0, (__global DATA_TYPE *)gy.ptr);
-
-    // Calculate and store the results
-#ifdef MAGNITUDE
-    Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude);
-    vstore16(MAGNITUDE_OP(in_a, in_b), 0, (__global DATA_TYPE *)magnitude.ptr);
-#endif /* MAGNITUDE */
-#ifdef PHASE
-    Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
-    vstore16(PHASE_OP(in_a, in_b), 0, phase.ptr);
-#endif /* PHASE */
-}
diff --git a/src/core/CL/cl_kernels/mean_stddev.cl b/src/core/CL/cl_kernels/mean_stddev.cl
deleted file mode 100644
index 4ddf931..0000000
--- a/src/core/CL/cl_kernels/mean_stddev.cl
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-
-/** This function calculates the sum and sum of squares of a given input image.
- *
- * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  height                            Height of the input image
- * @param[out] global_sum                        Global sum of all elements
- * @param[out] global_sum_sq                     Global sum of squares of all elements
- */
-__kernel void mean_stddev_accumulate(
-    IMAGE_DECLARATION(src),
-    uint     height,
-    __global ulong *global_sum
-#ifdef STDDEV
-    ,
-    __global ulong *global_sum_sq
-#endif /* STDDEV */
-)
-{
-    // Get pixels pointer
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-    uint8 tmp_sum = 0;
-#ifdef STDDEV
-    uint8 tmp_sum_sq = 0;
-#endif /* STDDEV */
-    // Calculate partial sum
-    for(int i = 0; i < height; i++)
-    {
-        // Load data
-        uint8 data = convert_uint8(vload8(0, offset(&src, 0, i)));
-
-        tmp_sum += data;
-#ifdef STDDEV
-        tmp_sum_sq += data * data;
-#endif /* STDDEV */
-    }
-    // Perform reduction
-    tmp_sum.s0123 += tmp_sum.s4567;
-    tmp_sum.s01 += tmp_sum.s23;
-    atom_add(global_sum, tmp_sum.s0 + tmp_sum.s1);
-
-#ifdef STDDEV
-    tmp_sum_sq.s0123 += tmp_sum_sq.s4567;
-    tmp_sum_sq.s01 += tmp_sum_sq.s23;
-    atom_add(global_sum_sq, tmp_sum_sq.s0 + tmp_sum_sq.s1);
-#endif /* STDDEV */
-}
-
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl
deleted file mode 100644
index 1045f22..0000000
--- a/src/core/CL/cl_kernels/minmaxloc.cl
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-#ifndef DATA_TYPE_MIN
-#define DATA_TYPE_MIN 0x0
-#endif /* DATA_TYPE_MIN */
-
-#ifndef DATA_TYPE_MAX
-#define DATA_TYPE_MAX 0xFF
-#endif /* DATA_TYPE_MAX */
-
-inline int FloatFlip(float val)
-{
-    union
-    {
-        int   int_val;
-        float flt_val;
-    } u_val;
-    u_val.flt_val = val;
-    return (u_val.int_val >= 0) ? u_val.int_val : u_val.int_val ^ 0x7FFFFFFF;
-}
-
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN);
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX);
-__constant int16 idx16 = (int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-/** This function identifies the min and maximum value of an input image.
- *
- * @note Input image data type must be passed as a preprocessor argument using -DDATA_TYPE.
- * Moreover, the minimum and maximum value of the given data type must be provided using -DDATA_TYPE_MIN and -DDATA_TYPE_MAX respectively.
- * @note In case image width is not a multiple of 16 then -DNON_MULTIPLE_OF_16 must be passed.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] min_max                           Pointer to buffer with minimum value in position 0 and maximum value in position 1
- * @param[in]  width                             Input image width
- */
-__kernel void minmax(
-    IMAGE_DECLARATION(src),
-    __global int *min_max,
-    int           width)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-    // Initialize local minimum and local maximum
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    local_min = type_max;
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    local_max = type_min;
-
-    // Calculate min/max of row
-    int i = 0;
-    for(; i + 16 <= width; i += 16)
-    {
-        VEC_DATA_TYPE(DATA_TYPE, 16)
-        data      = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
-        local_min = min(data, local_min);
-        local_max = max(data, local_max);
-    }
-
-#ifdef NON_MULTIPLE_OF_16
-    // Handle non multiple of 16
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
-#ifdef IS_DATA_TYPE_FLOAT
-    int16 valid_indices = (i + idx16) < width;
-#else  /* IS_DATA_TYPE_FLOAT */
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    valid_indices = CONVERT((i + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
-#endif /* IS_DATA_TYPE_FLOAT */
-    local_max = max(local_max, select(type_min, data, valid_indices));
-    local_min = min(local_min, select(type_max, data, valid_indices));
-#endif /* NON_MULTIPLE_OF_16 */
-
-    // Perform min/max reduction
-    local_min.s01234567 = min(local_min.s01234567, local_min.s89ABCDEF);
-    local_max.s01234567 = max(local_max.s01234567, local_max.s89ABCDEF);
-
-    local_min.s0123 = min(local_min.s0123, local_min.s4567);
-    local_max.s0123 = max(local_max.s0123, local_max.s4567);
-
-    local_min.s01 = min(local_min.s01, local_min.s23);
-    local_max.s01 = max(local_max.s01, local_max.s23);
-
-    local_min.s0 = min(local_min.s0, local_min.s1);
-    local_max.s0 = max(local_max.s0, local_max.s1);
-
-    // Update global min/max
-#ifdef IS_DATA_TYPE_FLOAT
-    atomic_min(&min_max[0], FloatFlip(local_min.s0));
-    atomic_max(&min_max[1], FloatFlip(local_max.s0));
-#else  /* IS_DATA_TYPE_FLOAT */
-    atomic_min(&min_max[0], local_min.s0);
-    atomic_max(&min_max[1], local_max.s0);
-#endif /* IS_DATA_TYPE_FLOAT */
-}
-
-/** This function counts the min and max occurrences in an image and tags their position.
- *
- * @note -DCOUNT_MIN_MAX should be specified if we want to count the occurrences of the minimum and maximum values.
- * @note -DLOCATE_MIN and/or -DLOCATE_MAX should be specified if we want to store the position of each occurrence on the given array.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  min_max                           Pointer to buffer with minimum value in position 0 and maximum value in position 1
- * @param[out] min_max_count                     Pointer to buffer with minimum value occurrences in position 0 and maximum value occurrences in position 1
- * @param[out] min_loc                           Array that holds the location of the minimum value occurrences
- * @param[in]  max_min_loc_count                 The maximum number of min value occurrences coordinates the array can hold
- * @param[out] max_loc                           Array that holds the location of the maximum value occurrences
- * @param[in]  max_max_loc_count                 The maximum number of max value occurrences coordinates the array can hold
- */
-__kernel void minmaxloc(
-    IMAGE_DECLARATION(src),
-    __global int *min_max,
-    __global uint *min_max_count
-#ifdef LOCATE_MIN
-    ,
-    __global Coordinates2D *min_loc, uint max_min_loc_count
-#endif /* LOCATE_MIN */
-#ifdef LOCATE_MAX
-    ,
-    __global Coordinates2D *max_loc, uint max_max_loc_count
-#endif /* LOCATE_MAX */
-)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-#ifdef IS_DATA_TYPE_FLOAT
-    __global float *min_max_ptr = (__global float *)min_max;
-    float           min_value   = min_max_ptr[0];
-    float           max_value   = min_max_ptr[1];
-#else  /* IS_DATA_TYPE_FLOAT */
-    int min_value = min_max[0];
-    int max_value = min_max[1];
-#endif /* IS_DATA_TYPE_FLOAT */
-
-    DATA_TYPE value = *((__global DATA_TYPE *)src.ptr);
-#ifdef COUNT_MIN_MAX
-    if(value == min_value)
-    {
-        uint idx = atomic_inc(&min_max_count[0]);
-#ifdef LOCATE_MIN
-        if(idx < max_min_loc_count)
-        {
-            min_loc[idx].x = get_global_id(0);
-            min_loc[idx].y = get_global_id(1);
-        }
-#endif /* LOCATE_MIN */
-    }
-    if(value == max_value)
-    {
-        uint idx = atomic_inc(&min_max_count[1]);
-#ifdef LOCATE_MAX
-        if(idx < max_max_loc_count)
-        {
-            max_loc[idx].x = get_global_id(0);
-            max_loc[idx].y = get_global_id(1);
-        }
-#endif /* LOCATE_MAX */
-    }
-#endif /* COUNT_MIN_MAX */
-}
diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
deleted file mode 100644
index 93c5024..0000000
--- a/src/core/CL/cl_kernels/non_linear_filter3x3.cl
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "non_linear_filter_helpers.h"
-
-/** This function applies a non linear filter on a 3x3 box basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_box3x3(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar16 top    = vload16(0, offset(&src, -1, -1));
-    uchar16 middle = vload16(0, offset(&src, -1, 0));
-    uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
-    // Apply respective filter
-#ifdef MIN
-    uchar16 tmp = min(top, min(middle, bottom));
-    uchar8  out = row_reduce_min_3(tmp);
-#elif defined(MAX)
-    uchar16 tmp = max(top, max(middle, bottom));
-    uchar8  out = row_reduce_max_3(tmp);
-#elif defined(MEDIAN)
-    uchar8 p0  = top.s01234567;
-    uchar8 p1  = top.s12345678;
-    uchar8 p2  = top.s23456789;
-    uchar8 p3  = middle.s01234567;
-    uchar8 p4  = middle.s12345678;
-    uchar8 p5  = middle.s23456789;
-    uchar8 p6  = bottom.s01234567;
-    uchar8 p7  = bottom.s12345678;
-    uchar8 p8  = bottom.s23456789;
-    uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non linear filter on a 3x3 cross basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_cross3x3(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar8  top    = vload8(0, offset(&src, 0, -1));
-    uchar16 middle = vload16(0, offset(&src, -1, 0));
-    uchar8  bottom = vload8(0, offset(&src, 0, 1));
-
-    // Apply respective filter
-#ifdef MIN
-    uchar8 tmp_middle = row_reduce_min_3(middle);
-    uchar8 out        = min(tmp_middle, min(top, bottom));
-#elif defined(MAX)
-    uchar8  tmp_middle = row_reduce_max_3(middle);
-    uchar8  out        = max(tmp_middle, max(top, bottom));
-#elif defined(MEDIAN)
-    uchar8 p0  = top.s01234567;
-    uchar8 p1  = middle.s01234567;
-    uchar8 p2  = middle.s12345678;
-    uchar8 p3  = middle.s23456789;
-    uchar8 p4  = bottom.s01234567;
-    uchar8 out = sort5(p0, p1, p2, p3, p4);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non linear filter on a 3x3 disk basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_disk3x3(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar16 top    = vload16(0, offset(&src, -1, -1));
-    uchar16 middle = vload16(0, offset(&src, -1, 0));
-    uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
-    // Apply respective filter
-#ifdef MIN
-    uchar16 tmp = min(top, min(middle, bottom));
-    uchar8  out = row_reduce_min_3(tmp);
-#elif defined(MAX)
-    uchar16 tmp        = max(top, max(middle, bottom));
-    uchar8  out        = row_reduce_max_3(tmp);
-#elif defined(MEDIAN)
-    uchar8 p0  = top.s01234567;
-    uchar8 p1  = top.s12345678;
-    uchar8 p2  = top.s23456789;
-    uchar8 p3  = middle.s01234567;
-    uchar8 p4  = middle.s12345678;
-    uchar8 p5  = middle.s23456789;
-    uchar8 p6  = bottom.s01234567;
-    uchar8 p7  = bottom.s12345678;
-    uchar8 p8  = bottom.s23456789;
-    uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}
diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
deleted file mode 100644
index 7c87284..0000000
--- a/src/core/CL/cl_kernels/non_linear_filter5x5.cl
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "non_linear_filter_helpers.h"
-
-// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
-
-/** Sorting network to sort 8 disks of diameter 5 and return their median.
- *
- * @param[in] top2    Values of elements two rows above.
- * @param[in] top     Values of elements one row above.
- * @param[in] middle  Values of middle elements.
- * @param[in] bottom  Values of elements one row below.
- * @param[in] bottom2 Values of elements two rows below.
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 median_disk5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
-{
-    uchar8 p0  = top2.s01234567;
-    uchar8 p1  = top2.s12345678;
-    uchar8 p2  = top2.s23456789;
-    uchar8 p3  = top.s01234567;
-    uchar8 p4  = top.s12345678;
-    uchar8 p5  = top.s23456789;
-    uchar8 p6  = top.s3456789A;
-    uchar8 p7  = top.s456789AB;
-    uchar8 p8  = middle.s01234567;
-    uchar8 p9  = middle.s12345678;
-    uchar8 p10 = middle.s23456789;
-    uchar8 p11 = middle.s3456789A;
-    uchar8 p12 = middle.s456789AB;
-    uchar8 p13 = bottom.s01234567;
-    uchar8 p14 = bottom.s12345678;
-    uchar8 p15 = bottom.s23456789;
-    uchar8 p16 = bottom.s3456789A;
-    uchar8 p17 = bottom.s456789AB;
-    uchar8 p18 = bottom2.s01234567;
-    uchar8 p19 = bottom2.s12345678;
-    uchar8 p20 = bottom2.s23456789;
-
-    SORT(p0, p1);
-    SORT(p2, p3);
-    SORT(p4, p5);
-    SORT(p6, p7);
-    SORT(p8, p9);
-    SORT(p10, p11);
-    SORT(p12, p13);
-    SORT(p14, p15);
-    SORT(p16, p17);
-    SORT(p18, p19);
-    SORT(p0, p2);
-    SORT(p1, p3);
-    SORT(p4, p6);
-    SORT(p5, p7);
-    SORT(p8, p10);
-    SORT(p9, p11);
-    SORT(p12, p14);
-    SORT(p13, p15);
-    SORT(p16, p18);
-    SORT(p17, p19);
-    SORT(p1, p2);
-    SORT(p5, p6);
-    SORT(p0, p4);
-    SORT(p3, p7);
-    SORT(p9, p10);
-    SORT(p13, p14);
-    SORT(p8, p12);
-    SORT(p11, p15);
-    SORT(p17, p18);
-    SORT(p16, p20);
-    SORT(p1, p5);
-    SORT(p2, p6);
-    SORT(p9, p13);
-    SORT(p10, p14);
-    SORT(p0, p8);
-    SORT(p7, p15);
-    SORT(p17, p20);
-    SORT(p1, p4);
-    SORT(p3, p6);
-    SORT(p9, p12);
-    SORT(p11, p14);
-    SORT(p18, p20);
-    SORT(p0, p16);
-    SORT(p2, p4);
-    SORT(p3, p5);
-    SORT(p10, p12);
-    SORT(p11, p13);
-    SORT(p1, p9);
-    SORT(p6, p14);
-    SORT(p19, p20);
-    SORT(p3, p4);
-    SORT(p11, p12);
-    SORT(p1, p8);
-    SORT(p2, p10);
-    SORT(p5, p13);
-    SORT(p7, p14);
-    SORT(p3, p11);
-    SORT(p2, p8);
-    SORT(p4, p12);
-    SORT(p7, p13);
-    SORT(p1, p17);
-    SORT(p3, p10);
-    SORT(p5, p12);
-    SORT(p1, p16);
-    SORT(p2, p18);
-    SORT(p3, p9);
-    SORT(p6, p12);
-    SORT(p2, p16);
-    SORT(p3, p8);
-    SORT(p7, p12);
-    SORT(p5, p9);
-    SORT(p6, p10);
-    SORT(p4, p8);
-    SORT(p7, p11);
-    SORT(p3, p19);
-    SORT(p5, p8);
-    SORT(p7, p10);
-    SORT(p3, p18);
-    SORT(p4, p20);
-    SORT(p6, p8);
-    SORT(p7, p9);
-    SORT(p3, p17);
-    SORT(p5, p20);
-    SORT(p7, p8);
-    SORT(p3, p16);
-    SORT(p6, p20);
-    SORT(p5, p17);
-    SORT(p7, p20);
-    SORT(p4, p16);
-    SORT(p6, p18);
-    SORT(p5, p16);
-    SORT(p7, p19);
-    SORT(p7, p18);
-    SORT(p6, p16);
-    SORT(p7, p17);
-    SORT(p10, p18);
-    SORT(p7, p16);
-    SORT(p9, p17);
-    SORT(p8, p16);
-    SORT(p9, p16);
-    SORT(p10, p16);
-
-    return p10;
-}
-
-/** Sorting network to sort 8 boxes of size 5 and return their median.
- *
- * @param[in] top2    Values of elements two rows above.
- * @param[in] top     Values of elements one row above.
- * @param[in] middle  Values of middle elements.
- * @param[in] bottom  Values of elements one row below.
- * @param[in] bottom2 Values of elements two rows below.
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 median_box5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
-{
-    uchar8 p0  = top2.s01234567;
-    uchar8 p1  = top2.s12345678;
-    uchar8 p2  = top2.s23456789;
-    uchar8 p3  = top2.s3456789A;
-    uchar8 p4  = top2.s456789AB;
-    uchar8 p5  = top.s01234567;
-    uchar8 p6  = top.s12345678;
-    uchar8 p7  = top.s23456789;
-    uchar8 p8  = top.s3456789A;
-    uchar8 p9  = top.s456789AB;
-    uchar8 p10 = middle.s01234567;
-    uchar8 p11 = middle.s12345678;
-    uchar8 p12 = middle.s23456789;
-    uchar8 p13 = middle.s3456789A;
-    uchar8 p14 = middle.s456789AB;
-    uchar8 p15 = bottom.s01234567;
-    uchar8 p16 = bottom.s12345678;
-    uchar8 p17 = bottom.s23456789;
-    uchar8 p18 = bottom.s3456789A;
-    uchar8 p19 = bottom.s456789AB;
-    uchar8 p20 = bottom2.s01234567;
-    uchar8 p21 = bottom2.s12345678;
-    uchar8 p22 = bottom2.s23456789;
-    uchar8 p23 = bottom2.s3456789A;
-    uchar8 p24 = bottom2.s456789AB;
-
-    SORT(p1, p2);
-    SORT(p0, p1);
-    SORT(p1, p2);
-    SORT(p4, p5);
-    SORT(p3, p4);
-    SORT(p4, p5);
-    SORT(p0, p3);
-    SORT(p2, p5);
-    SORT(p2, p3);
-    SORT(p1, p4);
-    SORT(p1, p2);
-    SORT(p3, p4);
-    SORT(p7, p8);
-    SORT(p6, p7);
-    SORT(p7, p8);
-    SORT(p10, p11);
-    SORT(p9, p10);
-    SORT(p10, p11);
-    SORT(p6, p9);
-    SORT(p8, p11);
-    SORT(p8, p9);
-    SORT(p7, p10);
-    SORT(p7, p8);
-    SORT(p9, p10);
-    SORT(p0, p6);
-    SORT(p4, p10);
-    SORT(p4, p6);
-    SORT(p2, p8);
-    SORT(p2, p4);
-    SORT(p6, p8);
-    SORT(p1, p7);
-    SORT(p5, p11);
-    SORT(p5, p7);
-    SORT(p3, p9);
-    SORT(p3, p5);
-    SORT(p7, p9);
-    SORT(p1, p2);
-    SORT(p3, p4);
-    SORT(p5, p6);
-    SORT(p7, p8);
-    SORT(p9, p10);
-    SORT(p13, p14);
-    SORT(p12, p13);
-    SORT(p13, p14);
-    SORT(p16, p17);
-    SORT(p15, p16);
-    SORT(p16, p17);
-    SORT(p12, p15);
-    SORT(p14, p17);
-    SORT(p14, p15);
-    SORT(p13, p16);
-    SORT(p13, p14);
-    SORT(p15, p16);
-    SORT(p19, p20);
-    SORT(p18, p19);
-    SORT(p19, p20);
-    SORT(p21, p22);
-    SORT(p23, p24);
-    SORT(p21, p23);
-    SORT(p22, p24);
-    SORT(p22, p23);
-    SORT(p18, p21);
-    SORT(p20, p23);
-    SORT(p20, p21);
-    SORT(p19, p22);
-    SORT(p22, p24);
-    SORT(p19, p20);
-    SORT(p21, p22);
-    SORT(p23, p24);
-    SORT(p12, p18);
-    SORT(p16, p22);
-    SORT(p16, p18);
-    SORT(p14, p20);
-    SORT(p20, p24);
-    SORT(p14, p16);
-    SORT(p18, p20);
-    SORT(p22, p24);
-    SORT(p13, p19);
-    SORT(p17, p23);
-    SORT(p17, p19);
-    SORT(p15, p21);
-    SORT(p15, p17);
-    SORT(p19, p21);
-    SORT(p13, p14);
-    SORT(p15, p16);
-    SORT(p17, p18);
-    SORT(p19, p20);
-    SORT(p21, p22);
-    SORT(p23, p24);
-    SORT(p0, p12);
-    SORT(p8, p20);
-    SORT(p8, p12);
-    SORT(p4, p16);
-    SORT(p16, p24);
-    SORT(p12, p16);
-    SORT(p2, p14);
-    SORT(p10, p22);
-    SORT(p10, p14);
-    SORT(p6, p18);
-    SORT(p6, p10);
-    SORT(p10, p12);
-    SORT(p1, p13);
-    SORT(p9, p21);
-    SORT(p9, p13);
-    SORT(p5, p17);
-    SORT(p13, p17);
-    SORT(p3, p15);
-    SORT(p11, p23);
-    SORT(p11, p15);
-    SORT(p7, p19);
-    SORT(p7, p11);
-    SORT(p11, p13);
-    SORT(p11, p12);
-    return p12;
-}
-
-/** This function applies a non linear filter on a 5x5 box basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_box5x5(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar16 top2    = vload16(0, offset(&src, -2, -2));
-    uchar16 top     = vload16(0, offset(&src, -2, -1));
-    uchar16 middle  = vload16(0, offset(&src, -2, 0));
-    uchar16 bottom  = vload16(0, offset(&src, -2, 1));
-    uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
-
-    // Apply respective filter
-#ifdef MIN
-    uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2)));
-    uchar8  out = row_reduce_min_5(tmp);
-#elif defined(MAX)
-    uchar16 tmp = max(middle, max(max(top2, top), max(bottom, bottom2)));
-    uchar8  out = row_reduce_max_5(tmp);
-#elif defined(MEDIAN)
-    uchar8 out = median_box5x5(top2, top, middle, bottom, bottom2);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non linear filter on a 5x5 cross basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_cross5x5(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar8  top2    = vload8(0, offset(&src, 0, -2));
-    uchar8  top     = vload8(0, offset(&src, 0, -1));
-    uchar16 middle  = vload16(0, offset(&src, -2, 0));
-    uchar8  bottom  = vload8(0, offset(&src, 0, 1));
-    uchar8  bottom2 = vload8(0, offset(&src, 0, 2));
-
-    // Apply respective filter
-#ifdef MIN
-    uchar8 tmp_middle = row_reduce_min_5(middle);
-    uchar8 out        = min(tmp_middle, min(min(top2, top), min(bottom, bottom2)));
-#elif defined(MAX)
-    uchar8  tmp_middle = row_reduce_max_5(middle);
-    uchar8  out        = max(tmp_middle, max(max(top2, top.s01234567), max(bottom, bottom2)));
-#elif defined(MEDIAN)
-    uchar8 p0  = top2;
-    uchar8 p1  = top;
-    uchar8 p2  = middle.s01234567;
-    uchar8 p3  = middle.s12345678;
-    uchar8 p4  = middle.s23456789;
-    uchar8 p5  = middle.s3456789A;
-    uchar8 p6  = middle.s456789AB;
-    uchar8 p7  = bottom;
-    uchar8 p8  = bottom2;
-    uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non linear filter on a 5x5 disk basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_disk5x5(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst))
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values
-    uchar16 top2    = vload16(0, offset(&src, -2, -2));
-    uchar16 top     = vload16(0, offset(&src, -2, -1));
-    uchar16 middle  = vload16(0, offset(&src, -2, 0));
-    uchar16 bottom  = vload16(0, offset(&src, -2, 1));
-    uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
-
-    // Shift top2 and bottom2 values
-    top2    = top2.s123456789ABCDEFF;
-    bottom2 = bottom2.s123456789ABCDEFF;
-
-    // Apply respective filter
-#ifdef MIN
-    uchar16 tmp_3     = min(top2, bottom2);
-    uchar16 tmp_5     = min(middle, min(top, bottom));
-    uchar8  tmp_3_red = row_reduce_min_3(tmp_3);
-    uchar8  tmp_5_red = row_reduce_min_5(tmp_5);
-    uchar8  out       = min(tmp_3_red, tmp_5_red);
-#elif defined(MAX)
-    uchar16 tmp_3      = max(top2, bottom2);
-    uchar16 tmp_5      = max(middle, max(top, bottom));
-    uchar8  tmp_3_red  = row_reduce_max_3(tmp_3);
-    uchar8  tmp_5_red  = row_reduce_max_5(tmp_5);
-    uchar8  out        = max(tmp_3_red, tmp_5_red);
-#elif defined(MEDIAN)
-    uchar8 out = median_disk5x5(top2, top, middle, bottom, bottom2);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
-    // Store result
-    vstore8(out, 0, dst.ptr);
-}
diff --git a/src/core/CL/cl_kernels/non_linear_filter_helpers.h b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
deleted file mode 100644
index 3fcfad4..0000000
--- a/src/core/CL/cl_kernels/non_linear_filter_helpers.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/** Sorts element-wise two vectors.
- *
- * @param[in, out] a First vector
- * @param[in, out] b Second vector
- */
-#define SORT(a, b)                  \
-    {                               \
-        uchar8 min_val = min(a, b); \
-        uchar8 max_val = max(a, b); \
-        a              = min_val;   \
-        b              = max_val;   \
-    }
-
-// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
-
-/** Sorting network to sort 5 vectors of 8 elements and return their median.
- *
- * @param[in] p0 First element vector
- * @param[in] p1 Second element vector
- * @param[in] p2 Third element vector
- * @param[in] p3 Fourth element vector
- * @param[in] p4 Fifth element vector
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
-{
-    SORT(p0, p1);
-    SORT(p2, p3);
-    SORT(p0, p2);
-    SORT(p1, p3);
-    SORT(p1, p2);
-    SORT(p0, p4);
-    SORT(p1, p4);
-    SORT(p2, p4);
-
-    return p2;
-}
-
-/** Sorting network to sort 9 vectors of 8 elements and return their median.
- *
- * @param[in] p0 First element vector
- * @param[in] p1 Second element vector
- * @param[in] p2 Third element vector
- * @param[in] p3 Fourth element vector
- * @param[in] p4 Fifth element vector
- * @param[in] p5 Sixth element vector
- * @param[in] p6 Seventh element vector
- * @param[in] p7 Eigth element vector
- * @param[in] p8 Ninth element vector
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
-{
-    SORT(p1, p2);
-    SORT(p4, p5);
-    SORT(p7, p8);
-    SORT(p0, p1);
-    SORT(p3, p4);
-    SORT(p6, p7);
-    SORT(p1, p2);
-    SORT(p4, p5);
-    SORT(p7, p8);
-    SORT(p0, p3);
-    SORT(p5, p8);
-    SORT(p4, p7);
-    SORT(p3, p6);
-    SORT(p1, p4);
-    SORT(p2, p5);
-    SORT(p4, p7);
-    SORT(p4, p2);
-    SORT(p6, p4);
-    SORT(p4, p2);
-
-    return p4;
-}
-
-/** Calculate the minimum of a sliding window of size 3.
- *
- * @param val Values to calculate the minimum values
- *
- * @return Minimum values of 8 elements on a sliding window of size 3.
- */
-inline uchar8 row_reduce_min_3(uchar16 val)
-{
-    return min(val.s01234567, min(val.s12345678, val.s23456789));
-}
-
-/** Calculate the maximum of a sliding window of size 3.
- *
- * @param val Values to calculate the maximum values
- *
- * @return Maximum values of 8 elements on a sliding window of size 3.
- */
-inline uchar8 row_reduce_max_3(uchar16 val)
-{
-    return max(val.s01234567, max(val.s12345678, val.s23456789));
-}
-
-/** Calculate the minimum of a sliding window of size 5.
- *
- * @param val Values to calculate the minimum values
- *
- * @return Minimum values of 8 elements on a sliding window of size 5.
- */
-inline uchar8 row_reduce_min_5(uchar16 val)
-{
-    return min(val.s01234567, min(min(val.s12345678, val.s23456789), min(val.s3456789A, val.s456789AB)));
-}
-
-/** Calculate the maximum of a sliding window of size 5.
- *
- * @param val Values to calculate the maximum values
- *
- * @return Maximum values of 8 elements on a sliding window of size 5.
- */
-inline uchar8 row_reduce_max_5(uchar16 val)
-{
-    return max(val.s01234567, max(max(val.s12345678, val.s23456789), max(val.s3456789A, val.s456789AB)));
-}
diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
deleted file mode 100644
index 9bbde1a..0000000
--- a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
+++ /dev/null
@@ -1,521 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-/*
- *The criteria for lost tracking is that the spatial gradient matrix has:
- * - Determinant less than DETERMINANT_THR
- * - or minimum eigenvalue is smaller then EIGENVALUE_THR
- *
- * The thresholds for the determinant and the minimum eigenvalue is
- * defined by the OpenVX spec
- *
- * Note: Also lost tracking happens when the point tracked coordinate is outside
- * the image coordinates
- *
- * https://www.khronos.org/registry/vx/specs/1.0/html/d0/d0c/group__group__vision__function__opticalflowpyrlk.html
- */
-
-/* Internal Lucas-Kanade Keypoint struct */
-typedef struct InternalKeypoint
-{
-    float x;               /**< The x coordinate. */
-    float y;               /**< The y coordinate. */
-    float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
-    float dummy;           /**< Dummy member for alignment. */
-} InternalKeypoint;
-
-/** Threshold for the determinant. Used for lost tracking criteria */
-#define DETERMINANT_THR 1.0e-07f
-
-/** Thresholds for minimum eigenvalue. Used for lost tracking criteria */
-#define EIGENVALUE_THR 1.0e-04f
-
-/** Constants used for Lucas-Kanade Algorithm */
-#define W_BITS (14)
-#define FLT_SCALE (1.0f / (float)(1 << 20))
-#define D0 ((float)(1 << W_BITS))
-#define D1 (1.0f / (float)(1 << (W_BITS - 5)))
-
-/** Initializes the internal new points array when the level of pyramid is NOT equal to max.
- *
- * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
- * @param[in,out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
- * @param[in]     scale               Scale factor to apply for the new_point coordinates.
- */
-__kernel void init_level(
-    __global float4 *old_points_internal,
-    __global float4 *new_points_internal,
-    const float      scale)
-{
-    int idx = get_global_id(0);
-
-    // Get old and new keypoints
-    float4 old_point = old_points_internal[idx];
-    float4 new_point = new_points_internal[idx];
-
-    // Scale accordingly with the pyramid_scale
-    old_point.xy *= (float2)(2.0f);
-    new_point.xy *= (float2)(2.0f);
-
-    old_points_internal[idx] = old_point;
-    new_points_internal[idx] = new_point;
-}
-
-/** Initializes the internal new points array when the level of pyramid is equal to max.
- *
- * @param[in]     old_points          An array of key points that are defined at the old_images high resolution pyramid.
- * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
- * @param[out]    new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
- * @param[in]     scale               Scale factor to apply for the new_point coordinates.
- */
-__kernel void init_level_max(
-    __global Keypoint *old_points,
-    __global InternalKeypoint *old_points_internal,
-    __global InternalKeypoint *new_points_internal,
-    const float                scale)
-{
-    int idx = get_global_id(0);
-
-    Keypoint old_point = old_points[idx];
-
-    // Get old keypoint to track
-    InternalKeypoint old_point_internal;
-    old_point_internal.x               = old_point.x * scale;
-    old_point_internal.y               = old_point.y * scale;
-    old_point_internal.tracking_status = 1.f;
-
-    // Store internal keypoints
-    old_points_internal[idx] = old_point_internal;
-    new_points_internal[idx] = old_point_internal;
-}
-
-/** Initializes the new_points array when the level of pyramid is equal to max and if use_initial_estimate = 1.
- *
- * @param[in]     old_points           An array of key points that are defined at the old_images high resolution pyramid.
- * @param[in]     new_points_estimates An array of estimate key points that are defined at the old_images high resolution pyramid.
- * @param[in,out] old_points_internal  An array of internal key points that are defined at the old_images high resolution pyramid.
- * @param[out]    new_points_internal  An array of internal key points that are defined at the new_images high resolution pyramid.
- * @param[in]     scale                Scale factor to apply for the new_point coordinates.
- */
-__kernel void init_level_max_initial_estimate(
-    __global Keypoint *old_points,
-    __global Keypoint *new_points_estimates,
-    __global InternalKeypoint *old_points_internal,
-    __global InternalKeypoint *new_points_internal,
-    const float                scale)
-{
-    int idx = get_global_id(0);
-
-    Keypoint         old_point          = old_points[idx];
-    Keypoint         new_point_estimate = new_points_estimates[idx];
-    InternalKeypoint old_point_internal;
-    InternalKeypoint new_point_internal;
-
-    // Get old keypoint to track
-    old_point_internal.x               = old_point.x * scale;
-    old_point_internal.y               = old_point.y * scale;
-    old_point_internal.tracking_status = 1.f;
-
-    // Get new keypoint to track
-    new_point_internal.x               = new_point_estimate.x * scale;
-    new_point_internal.y               = new_point_estimate.y * scale;
-    new_point_internal.tracking_status = new_point_estimate.tracking_status;
-
-    // Store internal keypoints
-    old_points_internal[idx] = old_point_internal;
-    new_points_internal[idx] = new_point_internal;
-}
-
-/** Truncates the coordinates stored in new_points array
- *
- * @param[in]  new_points_internal An array of estimate key points that are defined at the new_images high resolution pyramid.
- * @param[out] new_points          An array of internal key points that are defined at the new_images high resolution pyramid.
- */
-__kernel void finalize(
-    __global InternalKeypoint *new_points_internal,
-    __global Keypoint *new_points)
-{
-    int idx = get_global_id(0);
-
-    // Load internal keypoint
-    InternalKeypoint new_point_internal = new_points_internal[idx];
-
-    // Calculate output point
-    Keypoint new_point;
-    new_point.x               = round(new_point_internal.x);
-    new_point.y               = round(new_point_internal.y);
-    new_point.strength        = 0.f;
-    new_point.scale           = 0.f;
-    new_point.orientation     = 0.f;
-    new_point.tracking_status = new_point_internal.tracking_status;
-    new_point.error           = 0.f;
-
-    // Store new point
-    new_points[idx] = new_point;
-}
-
-/** Computes A11, A12, A22, min_eig, ival, ixval and iyval at level 0th of the pyramid. These values will be used in step 1.
- *
- * @param[in]      old_image_ptr                               Pointer to the input old image. Supported data types: U8
- * @param[in]      old_image_stride_x                          Stride of the input old image in X dimension (in bytes)
- * @param[in]      old_image_step_x                            old_image_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      old_image_stride_y                          Stride of the input old image in Y dimension (in bytes)
- * @param[in]      old_image_step_y                            old_image_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      old_image_offset_first_element_in_bytes     The offset of the first element in the input old image
- * @param[in]      old_scharr_gx_ptr                           Pointer to the input scharr x image. Supported data types: S16
- * @param[in]      old_scharr_gx_stride_x                      Stride of the input scharr x image in X dimension (in bytes)
- * @param[in]      old_scharr_gx_step_x                        old_scharr_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      old_scharr_gx_stride_y                      Stride of the input scharr x image in Y dimension (in bytes)
- * @param[in]      old_scharr_gx_step_y                        old_scharr_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      old_scharr_gx_offset_first_element_in_bytes The offset of the first element in the input scharr x image
- * @param[in]      old_scharr_gy_ptr                           Pointer to the input scharr y image. Supported data types: S16
- * @param[in]      old_scharr_gy_stride_x                      Stride of the input scharr y image in X dimension (in bytes)
- * @param[in]      old_scharr_gy_step_x                        old_scharr_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      old_scharr_gy_stride_y                      Stride of the input scharr y image in Y dimension (in bytes)
- * @param[in]      old_scharr_gy_step_y                        old_scharr_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      old_scharr_gy_offset_first_element_in_bytes The offset of the first element in the input scharr y image
- * @param[in]      old_points                                  An array of key points. Those key points are defined at the old_images high resolution pyramid
- * @param[in, out] new_points                                  An output array of key points. Those key points are defined at the new_images high resolution pyramid
- * @param[out]     coeff                                       It stores | A11 | A12 | A22 | min_eig | for each keypoint
- * @param[out]     iold_val                                    It stores | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
- * @param[in]      window_dimension                            The size of the window on which to perform the algorithm
- * @param[in]      window_dimension_pow2                       The squared size of the window on which to perform the algorithm
- * @param[in]      half_window                                 The half size of the window on which to perform the algorithm
- * @param[in]      border_limits                               It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,)
- * @param[in]      eig_const                                   1.0f / (float)(2.0f * window_dimension * window_dimension)
- * @param[in]      level0                                      It is set to 1 if level 0 of the pyramid
- */
-void __kernel lktracker_stage0(
-    IMAGE_DECLARATION(old_image),
-    IMAGE_DECLARATION(old_scharr_gx),
-    IMAGE_DECLARATION(old_scharr_gy),
-    __global float4 *old_points,
-    __global float4 *new_points,
-    __global float4 *coeff,
-    __global short4 *iold_val,
-    const int        window_dimension,
-    const int        window_dimension_pow2,
-    const int        half_window,
-    const float3     border_limits,
-    const float      eig_const,
-    const int        level0)
-{
-    int idx = get_global_id(0);
-
-    Image old_image     = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_image);
-    Image old_scharr_gx = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gx);
-    Image old_scharr_gy = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gy);
-
-    // Get old keypoint
-    float2 old_keypoint = old_points[idx].xy - (float2)half_window;
-
-    // Get the floor value
-    float2 iold_keypoint = floor(old_keypoint);
-
-    // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
-    if(any(iold_keypoint < border_limits.zz) || any(iold_keypoint >= border_limits.xy))
-    {
-        if(level0 == 1)
-        {
-            // Invalidate tracked point as we are at level 0
-            new_points[idx].s2 = 0.0f;
-        }
-
-        // Not valid coordinate. It sets min_eig to 0.0f
-        coeff[idx].s3 = 0.0f;
-
-        return;
-    }
-
-    // Compute weight for the bilinear interpolation
-    float2 ab = old_keypoint - iold_keypoint;
-
-    // Weight used for Bilinear-Interpolation on Scharr images
-    // w_scharr.s0 = (1.0f - ab.x) * (1.0f - ab.y)
-    // w_scharr.s1 = ab.x * (1.0f - ab.y)
-    // w_scharr.s2 = (1.0f - ab.x) * ab.y
-    // w_scharr.s3 = ab.x * ab.y
-
-    float4 w_scharr;
-    w_scharr.s3  = ab.x * ab.y;
-    w_scharr.s0  = w_scharr.s3 + 1.0f - ab.x - ab.y;
-    w_scharr.s12 = ab - (float2)w_scharr.s3;
-
-    // Weight used for Bilinear-Interpolation on Old and New images
-    // w.s0 = round(w_scharr.s0 * D0)
-    // w.s1 = round(w_scharr.s1 * D0)
-    // w.s2 = round(w_scharr.s2 * D0)
-    // w.s3 = w.s3 = D0 - w.s0 - w.s1 - w.s2
-
-    float4 w;
-    w    = round(w_scharr * (float4)D0);
-    w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation
-
-    // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
-    int4 iG = (int4)0;
-
-    // Window offset
-    int window_offset = idx * window_dimension_pow2;
-
-    // Compute Spatial Gradient Matrix G
-    for(ushort ky = 0; ky < window_dimension; ++ky)
-    {
-        int offset_y = iold_keypoint.y + ky;
-        for(ushort kx = 0; kx < window_dimension; ++kx)
-        {
-            int    offset_x = iold_keypoint.x + kx;
-            float4 px;
-
-            // Load values from old_image for computing the bilinear interpolation
-            px = convert_float4((uchar4)(vload2(0, offset(&old_image, offset_x, offset_y)),
-                                         vload2(0, offset(&old_image, offset_x, offset_y + 1))));
-
-            // old_i.s0 = ival, old_i.s1 = ixval, old_i.s2 = iyval, old_i.s3 = dummy
-            float4 old_i;
-
-            // Compute bilinear interpolation (with D1 scale factor) for ival
-            old_i.s0 = dot(px, w) * D1;
-
-            // Load values from old_scharr_gx for computing the bilinear interpolation
-            px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y)),
-                                         vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y + 1))));
-
-            // Compute bilinear interpolation for ixval
-            old_i.s1 = dot(px, w_scharr);
-
-            // Load values from old_scharr_gy for computing the bilinear interpolation
-            px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y)),
-                                         vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y + 1))));
-
-            // Compute bilinear interpolation for iyval
-            old_i.s2 = dot(px, w_scharr);
-
-            // Rounding (it could be omitted. Used just for matching the VX implementation)
-            int4 iold = convert_int4(round(old_i));
-
-            // Accumulate values in the Spatial Gradient Matrix
-            iG.s0 += (int)(iold.s1 * iold.s1);
-            iG.s1 += (int)(iold.s1 * iold.s2);
-            iG.s2 += (int)(iold.s2 * iold.s2);
-
-            // Store ival, ixval and iyval
-            iold_val[window_offset + kx] = convert_short4(iold);
-        }
-        window_offset += window_dimension;
-    }
-
-    // Scale iA11, iA12 and iA22
-    float4 G = convert_float4(iG) * (float4)FLT_SCALE;
-
-    // Compute minimum eigen value
-    G.s3 = (float)(G.s2 + G.s0 - sqrt(pown(G.s0 - G.s2, 2) + 4.0f * G.s1 * G.s1)) * eig_const;
-
-    // Store A11. A11, A22 and min_eig
-    coeff[idx] = G;
-}
-
-/** Computes the motion vector for a given keypoint
- *
- * @param[in]      new_image_ptr                           Pointer to the input new image. Supported data types: U8
- * @param[in]      new_image_stride_x                      Stride of the input new image in X dimension (in bytes)
- * @param[in]      new_image_step_x                        new_image_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      new_image_stride_y                      Stride of the input new image in Y dimension (in bytes)
- * @param[in]      new_image_step_y                        new_image_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      new_image_offset_first_element_in_bytes The offset of the first element in the input new image
- * @param[in, out] new_points                              An output array of key points. Those key points are defined at the new_images high resolution pyramid
- * @param[in]      coeff                                   The | A11 | A12 | A22 | min_eig | for each keypoint
- * @param[in]      iold_val                                The | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
- * @param[in]      window_dimension                        The size of the window on which to perform the algorithm
- * @param[in]      window_dimension_pow2                   The squared size of the window on which to perform the algorithm
- * @param[in]      half_window                             The half size of the window on which to perform the algorithm
- * @param[in]      num_iterations                          The maximum number of iterations
- * @param[in]      epsilon                                 The value for terminating the algorithm.
- * @param[in]      border_limits                           It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,)
- * @param[in]      eig_const                               1.0f / (float)(2.0f * window_dimension * window_dimension)
- * @param[in]      level0                                  It is set to 1 if level of pyramid = 0
- * @param[in]      term_epsilon                            It is set to 1 if termination = TERM_CRITERIA_EPSILON
- */
-void __kernel lktracker_stage1(
-    IMAGE_DECLARATION(new_image),
-    __global float4 *new_points,
-    __global float4 *coeff,
-    __global short4 *iold_val,
-    const int        window_dimension,
-    const int        window_dimension_pow2,
-    const int        half_window,
-    const int        num_iterations,
-    const float      epsilon,
-    const float3     border_limits,
-    const float      eig_const,
-    const int        level0,
-    const int        term_epsilon)
-{
-    int   idx       = get_global_id(0);
-    Image new_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(new_image);
-
-    // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
-    float4 G = coeff[idx];
-
-    // Determinant
-    float D = G.s0 * G.s2 - G.s1 * G.s1;
-
-    // Check if it is a good point to track
-    if(G.s3 < EIGENVALUE_THR || D < DETERMINANT_THR)
-    {
-        if(level0 == 1)
-        {
-            // Invalidate tracked point as we are at level 0
-            new_points[idx].s2 = 0;
-        }
-
-        return;
-    }
-
-    // Compute inverse
-    //D = native_recip(D);
-    D = 1.0 / D;
-
-    // Get new keypoint
-    float2 new_keypoint = new_points[idx].xy - (float)half_window;
-
-    // Get new point
-    float2 out_new_point = new_points[idx].xy;
-
-    // Keep delta obtained in the previous iteration
-    float2 prev_delta = (float2)0.0f;
-
-    int j = 0;
-    while(j < num_iterations)
-    {
-        // Get the floor value
-        float2 inew_keypoint = floor(new_keypoint);
-
-        // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
-        if(any(inew_keypoint < border_limits.zz) || any(inew_keypoint >= border_limits.xy))
-        {
-            if(level0 == 1)
-            {
-                // Invalidate tracked point as we are at level 0
-                new_points[idx].s2 = 0.0f;
-            }
-            else
-            {
-                new_points[idx].xy = out_new_point;
-            }
-
-            return;
-        }
-
-        // Compute weight for the bilinear interpolation
-        float2 ab = new_keypoint - inew_keypoint;
-
-        // Weight used for Bilinear-Interpolation on Old and New images
-        // w.s0 = round((1.0f - ab.x) * (1.0f - ab.y) * D0)
-        // w.s1 = round(ab.x * (1.0f - ab.y) * D0)
-        // w.s2 = round((1.0f - ab.x) * ab.y * D0)
-        // w.s3 = D0 - w.s0 - w.s1 - w.s2
-
-        float4 w;
-        w.s3  = ab.x * ab.y;
-        w.s0  = w.s3 + 1.0f - ab.x - ab.y;
-        w.s12 = ab - (float2)w.s3;
-        w     = round(w * (float4)D0);
-        w.s3  = D0 - w.s0 - w.s1 - w.s2;
-
-        // Mismatch vector
-        int2 ib = 0;
-
-        // Old val offset
-        int old_val_offset = idx * window_dimension_pow2;
-
-        for(int ky = 0; ky < window_dimension; ++ky)
-        {
-            for(int kx = 0; kx < window_dimension; ++kx)
-            {
-                // ival, ixval and iyval have been computed in the previous stage
-                int4 old_ival = convert_int4(iold_val[old_val_offset]);
-
-                // Load values from old_image for computing the bilinear interpolation
-                float4 px = convert_float4((uchar4)(vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky)),
-                                                    vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky + 1))));
-
-                // Compute bilinear interpolation on new image
-                int jval = (int)round(dot(px, w) * D1);
-
-                // Compute luminance difference
-                int diff = (int)(jval - old_ival.s0);
-
-                // Accumulate values in mismatch vector
-                ib += (diff * old_ival.s12);
-
-                // Update old val offset
-                old_val_offset++;
-            }
-        }
-
-        float2 b = convert_float2(ib) * (float2)FLT_SCALE;
-
-        // Optical Flow
-        float2 delta;
-
-        delta.x = (float)((G.s1 * b.y - G.s2 * b.x) * D);
-        delta.y = (float)((G.s1 * b.x - G.s0 * b.y) * D);
-
-        // Update new point coordinate
-        new_keypoint += delta;
-
-        out_new_point = new_keypoint + (float2)half_window;
-
-        if(term_epsilon == 1)
-        {
-            float mag2 = dot(delta, delta);
-
-            if(mag2 <= epsilon)
-            {
-                new_points[idx].xy = out_new_point;
-
-                return;
-            }
-        }
-
-        // Check convergence analyzing the previous delta
-        if(j > 0 && all(fabs(delta + prev_delta) < (float2)0.01f))
-        {
-            out_new_point -= delta * (float2)0.5f;
-
-            new_points[idx].xy = out_new_point;
-
-            return;
-        }
-
-        // Update previous delta
-        prev_delta = delta;
-
-        j++;
-    }
-
-    new_points[idx].xy = out_new_point;
-}
diff --git a/src/core/CL/cl_kernels/scharr_filter.cl b/src/core/CL/cl_kernels/scharr_filter.cl
deleted file mode 100644
index d2868b6..0000000
--- a/src/core/CL/cl_kernels/scharr_filter.cl
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This OpenCL kernel computes Scharr3x3.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] dst_gx_ptr                           Pointer to the destination image Supported data types: S16
- * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void scharr3x3(
-    IMAGE_DECLARATION(src)
-#ifdef GRAD_X
-    ,
-    IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    ,
-    IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
-    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-    // Output pixels
-#ifdef GRAD_X
-    short8 gx = (short8)0;
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    short8 gy = (short8)0;
-#endif /* GRAD_Y */
-
-    // Row0
-    uchar16 temp   = vload16(0, offset(&src, -1, -1));
-    short8  left   = convert_short8(temp.s01234567);
-    short8  middle = convert_short8(temp.s12345678);
-    short8  right  = convert_short8(temp.s23456789);
-#ifdef GRAD_X
-    gx += left * (short8)(-3);
-    gx += right * (short8)(+3);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    gy += left * (short8)(-3);
-    gy += middle * (short8)(-10);
-    gy += right * (short8)(-3);
-#endif /* GRAD_Y */
-
-    // Row1
-    temp  = vload16(0, offset(&src, -1, 0));
-    left  = convert_short8(temp.s01234567);
-    right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
-    gx += left * (short8)(-10);
-    gx += right * (short8)(+10);
-#endif /* GRAD_X */
-
-    // Row2
-    temp   = vload16(0, offset(&src, -1, 1));
-    left   = convert_short8(temp.s01234567);
-    middle = convert_short8(temp.s12345678);
-    right  = convert_short8(temp.s23456789);
-#ifdef GRAD_X
-    gx += left * (short8)(-3);
-    gx += right * (short8)(+3);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    gy += left * (short8)(+3);
-    gy += middle * (short8)(+10);
-    gy += right * (short8)(+3);
-#endif /* GRAD_Y */
-
-    // Store results
-#ifdef GRAD_X
-    vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
diff --git a/src/core/CL/cl_kernels/tablelookup.cl b/src/core/CL/cl_kernels/tablelookup.cl
deleted file mode 100644
index 0ef1648..0000000
--- a/src/core/CL/cl_kernels/tablelookup.cl
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function performs table lookup on U8 input/output images.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- *
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  lut                               LUT table. Supported data types: U8
- */
-__kernel void tablelookup_U8(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst),
-    __global uchar *lut)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    /* Load input data */
-    uchar8 data = vload8(0, src.ptr);
-
-    /* Load lut data */
-    uchar8 lut_data = (uchar8)(lut[data.s0], lut[data.s1], lut[data.s2], lut[data.s3],
-                               lut[data.s4], lut[data.s5], lut[data.s6], lut[data.s7]);
-
-    /* Store result */
-    vstore8(lut_data, 0, dst.ptr);
-}
-
-/** This function performs table lookup on S16 input/output images.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: S16
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  lut                               LUT table. Supported data types: S16
- * @param[in]  offset                            LUT offset
- * @param[in]  count                             Number of elements in the LUT
- */
-__kernel void tablelookup_S16(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst),
-    __global short *lut,
-    uint            offset,
-    uint            count)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    /* Load input data */
-    short8 data = vload8(0, (__global short *)src.ptr);
-
-    /* Load output data */
-    int8 out_data = convert_int8(vload8(0, (__global short *)dst.ptr));
-
-    /* Calculate index */
-    int8 index = convert_int8(data) + (int8)(offset);
-    int8 cond  = (index >= 0 && index < (int8)count);
-    index      = select(0, index, cond);
-
-    /* Load lut data */
-    int8 lut_data = (int8)(lut[index.s0], lut[index.s1], lut[index.s2], lut[index.s3],
-                           lut[index.s4], lut[index.s5], lut[index.s6], lut[index.s7]);
-
-    /* Select output data depending on condition */
-    lut_data = select(out_data, lut_data, cond);
-
-    /* Store result */
-    vstore8(convert_short8(lut_data), 0, (__global short *)dst.ptr);
-}
diff --git a/src/core/CL/cl_kernels/threshold.cl b/src/core/CL/cl_kernels/threshold.cl
deleted file mode 100644
index ff3ac05..0000000
--- a/src/core/CL/cl_kernels/threshold.cl
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Perform binary thresholding on an image.
- *
- * @param[in]  in_ptr                            Pointer to the source image
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the first source image
- * @param[out] out_ptr                           Pointer to the destination image
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  false_val                         False value
- * @param[in]  true_val                          True value
- * @param[in]  threshold                         The thresold value
- */
-__kernel void threshold_binary(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const uchar false_val,
-    const uchar true_val,
-    const uchar threshold)
-{
-    // Get pixels pointer
-    Image in  = CONVERT_TO_IMAGE_STRUCT(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-
-    // Load data
-    uchar16 in_data = vload16(0, in.ptr);
-
-    // Perform binary thresholding
-    in_data = select((uchar16)false_val, (uchar16)true_val, in_data > (uchar16)threshold);
-
-    // Store result
-    vstore16(in_data, 0, out.ptr);
-}
-
-/** Perform range thresholding on an image.
- *
- * @param[in]  in_ptr                            Pointer to the source image
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the first source image
- * @param[out] out_ptr                           Pointer to the destination image
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  false_val                         False value
- * @param[in]  true_val                          True value
- * @param[in]  lower                             Lower threshold
- * @param[in]  upper                             Upper threshold
- */
-__kernel void threshold_range(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const uchar false_val,
-    const uchar true_val,
-    const uchar lower,
-    const uchar upper)
-{
-    // Get pixels pointer
-    Image in  = CONVERT_TO_IMAGE_STRUCT(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-
-    // Load data
-    uchar16 in_data = vload16(0, in.ptr);
-
-    // Perform range thresholding
-    in_data = select((uchar16)true_val, (uchar16)false_val, in_data > (uchar16)upper || in_data < (uchar16)lower);
-
-    // Store result
-    vstore16(in_data, 0, out.ptr);
-}
diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl
deleted file mode 100644
index 909b920..0000000
--- a/src/core/CL/cl_kernels/warp_affine.cl
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-/** Returns a vector of floats contaning the matrix coefficients. */
-inline const float8 build_affine_mtx()
-{
-    return (float8)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, 0, 0);
-}
-
-/** Transforms 4 2D coordinates using the formula:
- *
- *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- *
- * @param[in] coord 2D coordinate to transform.
- * @param[in] mtx   affine matrix
- *
- * @return a int8 containing 4 2D transformed values.
- */
-inline const float8 apply_affine_transform(const float2 coord, const float8 mtx)
-{
-    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-    // transform [x,x+1,x+2,x+3]
-    const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4)));
-    // transform [y,y+1,y+2,y+3]
-    const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5)));
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-}
-
-/** Performs an affine transform on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8.
- *
- * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation:
- *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- *   output(x,y) = input(x0,y0)
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in]  width                             Width of the destination image
- * @param[in]  height                            Height of the destination image
- */
-__kernel void warp_affine_nearest_neighbour(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const int width,
-    const int height)
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
-}
-
-/** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in]  width                             Width of the destination image
- * @param[in]  height                            Height of the destination image
- */
-__kernel void warp_affine_bilinear(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const int width,
-    const int height)
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(bilinear_interpolate(&in, apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), 0, out.ptr);
-}
diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl
deleted file mode 100644
index bed7838..0000000
--- a/src/core/CL/cl_kernels/warp_perspective.cl
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-/** Returns the perspective matrix */
-inline const float16 build_perspective_mtx()
-{
-    return (float16)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, 0, 0, 0, (float4)0);
-}
-
-/** Transforms four 2D coordinates using the formula:
- *
- *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- *   z0 = M[3][1] * x + M[3][2] * y + M[3][3]
- *
- *   (x0/z0,y0/z0)
- *
- * @param[in] coord 2D coordinate to transform.
- * @param[in] mtx   perspective matrix
- *
- * @return a vector float8 containing four 2D transformed values.
- */
-inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx)
-{
-    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-    // transform [z,z+1,z+2,z+3]
-    const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8)));
-    // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation
-    // transform [x,x+1,x+2,x+3]
-    const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z;
-    // transform [y,y+1,y+2,y+3]
-    const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z;
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-}
-
-/** Performs perspective transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8.
- *
- * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation:
- *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- *   z0 = M[3][1] * x + M[3][2] * y + M[3][3]
- *
- *   output(x,y) = input(x0/z0,y0/z0)
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in]  width                             Width of the destination image
- * @param[in]  height                            Height of the destination image
- */
-__kernel void warp_perspective_nearest_neighbour(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const int width,
-    const int height)
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr);
-}
-
-/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in]  width                             Width of the destination image
- * @param[in]  height                            Height of the destination image
- */
-__kernel void warp_perspective_bilinear(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const int width,
-    const int height)
-{
-    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(bilinear_interpolate(&in, apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), 0, out.ptr);
-}
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
deleted file mode 100644
index 76b60cb..0000000
--- a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-
-#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLAbsoluteDifferenceKernel::CLAbsoluteDifferenceKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLAbsoluteDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
-}
-
-void CLAbsoluteDifferenceKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
-                             "The output image can only be U8 if both input images are U8");
-
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
-    // Set kernel build options
-    std::set<std::string> build_opts;
-    build_opts.insert("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-    build_opts.insert("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-    build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "absdiff", build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input1_access, input2_access, output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
-                                                       input2->info()->valid_region());
-
-    output_access.set_valid_region(win, valid_region);
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLAbsoluteDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input1, slice);
-        add_2D_tensor_argument(idx, _input2, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h
deleted file mode 100644
index 28f28fe..0000000
--- a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H
-#define ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the absolute difference kernel.
- *
- * Absolute difference is computed by:
- * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
- */
-class CLAbsoluteDifferenceKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLAbsoluteDifferenceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLAbsoluteDifferenceKernel(const CLAbsoluteDifferenceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLAbsoluteDifferenceKernel &operator=(const CLAbsoluteDifferenceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLAbsoluteDifferenceKernel(CLAbsoluteDifferenceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLAbsoluteDifferenceKernel &operator=(CLAbsoluteDifferenceKernel &&) = default;
-    /** Default destructor */
-    ~CLAbsoluteDifferenceKernel() = default;
-
-    /** Set the inputs and output images.
-     *
-     * @param[in]  input1 Source tensor. Data types supported: U8/S16.
-     * @param[in]  input2 Source tensor. Data types supported: U8/S16.
-     * @param[out] output Destination tensor. Data types supported: U8/S16.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-    /** Set the inputs and output images.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          Source tensor. Data types supported: U8/S16.
-     * @param[in]  input2          Source tensor. Data types supported: U8/S16.
-     * @param[out] output          Destination tensor. Data types supported: U8/S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input1; /**< Source tensor 1. */
-    const ICLTensor *_input2; /**< Source tensor 2. */
-    ICLTensor       *_output; /**< Destination tensor. */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H */
diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp
deleted file mode 100644
index b0a8eba..0000000
--- a/src/core/CL/kernels/CLAccumulateKernel.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLAccumulateKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-} // namespace
-
-void CLAccumulateKernel::configure(const ICLTensor *input, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, accum);
-}
-
-void CLAccumulateKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "accumulate");
-
-    // Make sure _kernel is initialized before calling the parent's configure
-    ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-
-void CLAccumulateWeightedKernel::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum);
-}
-
-void CLAccumulateWeightedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "accumulate_weighted");
-
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, alpha);
-
-    // Configure kernel window
-    ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-
-void CLAccumulateSquaredKernel::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum);
-}
-
-void CLAccumulateSquaredKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(shift > 15);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "accumulate_squared");
-
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, shift);
-
-    // Configure kernel window
-    ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLAccumulateKernel.h b/src/core/CL/kernels/CLAccumulateKernel.h
deleted file mode 100644
index 16a7153..0000000
--- a/src/core/CL/kernels/CLAccumulateKernel.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLACCUMULATEKERNEL_H
-#define ARM_COMPUTE_CLACCUMULATEKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the accumulate kernel.
- *
- * Accumulation is computed by:
- * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
- */
-class CLAccumulateKernel : public ICLSimple2DKernel
-{
-public:
-    /** Set the input and accumulation tensors.
-     *
-     * @param[in]  input Source tensor. Data types supported: U8.
-     * @param[out] accum Destination tensor. Data types supported: S16.
-     */
-    void configure(const ICLTensor *input, ICLTensor *accum);
-    /** Set the input and accumulation tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] accum           Destination tensor. Data types supported: S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum);
-};
-
-/** Interface for the accumulate weighted kernel.
- *
- * Weighted accumulation is computed:
- * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
- *
- * Where @f$ 0 \le \alpha \le 1 @f$
- * Conceptually, the rounding for this is defined as:
- * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
-*/
-class CLAccumulateWeightedKernel : public ICLSimple2DKernel
-{
-public:
-    /** Set the input and accumulation images, and the scale value.
-     *
-     * @param[in]     input Source tensor. Data types supported: U8.
-     * @param[in]     alpha Scalar value in the range [0, 1.0]. Data types supported: F32.
-     * @param[in,out] accum Accumulated tensor. Data types supported: U8.
-     */
-    void configure(const ICLTensor *input, float alpha, ICLTensor *accum);
-    /** Set the input and accumulation images, and the scale value.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Source tensor. Data types supported: U8.
-     * @param[in]     alpha           Scalar value in the range [0, 1.0]. Data types supported: F32.
-     * @param[in,out] accum           Accumulated tensor. Data types supported: U8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum);
-};
-
-/** Interface for the accumulate squared kernel.
- *
- * The accumulation of squares is computed:
- * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
- *
- * Where @f$ 0 \le shift \le 15 @f$
-*/
-class CLAccumulateSquaredKernel : public ICLSimple2DKernel
-{
-public:
-    /** Set the input and accumulation tensors and the shift value.
-     *
-     * @param[in]     input Source tensor. Data types supported: U8.
-     * @param[in]     shift Shift value in the range of [0, 15]. Data types supported: U32.
-     * @param[in,out] accum Accumulated tensor. Data types supported: S16.
-     */
-    void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum);
-    /** Set the input and accumulation tensors and the shift value.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Source tensor. Data types supported: U8.
-     * @param[in]     shift           Shift value in the range of [0, 15]. Data types supported: U32.
-     * @param[in,out] accum           Accumulated tensor. Data types supported: S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLACCUMULATEKERNEL_H */
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
deleted file mode 100644
index 9f493b4..0000000
--- a/src/core/CL/kernels/CLBox3x3Kernel.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLBox3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-BorderSize CLBox3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLBox3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    _input  = input;
-    _output = output;
-
-    // Set build options
-    std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=1", "-DMAT2=1",
-                                         "-DMAT3=1", "-DMAT4=1", "-DMAT5=1",
-                                         "-DMAT6=1", "-DMAT7=1", "-DMAT8=1",
-                                         "-DSCALE=9", "-DDATA_TYPE_OUT=uchar"
-                                       };
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "convolution3x3_static", build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.h b/src/core/CL/kernels/CLBox3x3Kernel.h
deleted file mode 100644
index 2373c4a..0000000
--- a/src/core/CL/kernels/CLBox3x3Kernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBOX3X3KERNEL_H
-#define ARM_COMPUTE_CLBOX3X3KERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the box 3x3 filter kernel.
- *
- */
-class CLBox3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    //Inherited methods overriden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLBOX3X3KERNEL_H */
diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
deleted file mode 100644
index 1fe944c..0000000
--- a/src/core/CL/kernels/CLCannyEdgeKernel.cpp
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLCannyEdgeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-CLGradientKernel::CLGradientKernel()
-    : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
-{
-}
-
-void CLGradientKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), gx, gy, magnitude, phase, norm_type);
-}
-
-void CLGradientKernel::configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(gy->info()->data_type()),
-                             "Gx and Gy must have the same pixel size");
-    ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(magnitude->info()->data_type()),
-                             "Mag must have the same pixel size as Gx and Gy");
-
-    _gx        = gx;
-    _gy        = gy;
-    _magnitude = magnitude;
-    _phase     = phase;
-
-    // Create build opts
-    std::set<std::string> built_opts;
-    built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(gx->info()->data_type()));
-    built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(gx->info()->data_type()));
-
-    // Create kernel
-    const std::string kernel_name = (norm_type == 1) ? std::string("combine_gradients_L1") : std::string("combine_gradients_L2");
-    _kernel                       = create_kernel(compile_context, kernel_name, built_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-
-    Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
-
-    mag_access.set_valid_region(win, _gx->info()->valid_region());
-    phase_access.set_valid_region(win, _gx->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(gx->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gx->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gx->info()->dimension(1));
-}
-
-void CLGradientKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _gx, slice);
-        add_2D_tensor_argument(idx, _gy, slice);
-        add_2D_tensor_argument(idx, _magnitude, slice);
-        add_2D_tensor_argument(idx, _phase, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLEdgeNonMaxSuppressionKernel::CLEdgeNonMaxSuppressionKernel()
-    : _magnitude(nullptr), _phase(nullptr), _output(nullptr)
-{
-}
-
-BorderSize CLEdgeNonMaxSuppressionKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLEdgeNonMaxSuppressionKernel::configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), magnitude, phase, output, lower_thr, border_undefined);
-}
-
-void CLEdgeNonMaxSuppressionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::U32);
-
-    _magnitude = magnitude;
-    _phase     = phase;
-    _output    = output;
-
-    // Create build opts
-    std::set<std::string> built_opts;
-    built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(magnitude->info()->data_type()));
-    built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-    // Create kernel
-    const std::string kernel_name = std::string("suppress_non_maximum");
-    _kernel                       = create_kernel(compile_context, kernel_name, built_opts);
-
-    // Set minimum threshold argument
-    unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, lower_thr);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration    = 1;
-    constexpr unsigned int num_elems_read_written_per_iteration = 3;
-
-    Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top,
-                                     num_elems_read_written_per_iteration, num_elems_read_written_per_iteration);
-    AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, mag_access, phase_access, output_access);
-
-    output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(output->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLEdgeNonMaxSuppressionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _magnitude, slice);
-        add_2D_tensor_argument(idx, _phase, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLEdgeTraceKernel::CLEdgeTraceKernel()
-    : _input(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0), _visited(nullptr), _recorded(nullptr), _l1_stack(nullptr), _l1_stack_counter(nullptr)
-{
-}
-
-void CLEdgeTraceKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
-                                  ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, visited, recorded, l1_stack, l1_stack_counter);
-}
-
-void CLEdgeTraceKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
-                                  ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(visited, 1, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(recorded, 1, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack, 1, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack_counter, 1, DataType::U8);
-
-    _input            = input;
-    _output           = output;
-    _lower_thr        = lower_thr;
-    _upper_thr        = upper_thr;
-    _visited          = visited;
-    _recorded         = recorded;
-    _l1_stack         = l1_stack;
-    _l1_stack_counter = l1_stack_counter;
-
-    // Create build opts
-    std::set<std::string> built_opts;
-    built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
-    built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-    // Create kernel
-    const std::string kernel_name = std::string("hysteresis");
-    _kernel                       = create_kernel(compile_context, kernel_name, built_opts);
-
-    // Set constant kernel args
-    unsigned int width  = _input->info()->dimension(0);
-    unsigned int height = _input->info()->dimension(1);
-    unsigned int idx    = 6 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, static_cast<cl_uint>(_lower_thr));
-    _kernel.setArg(idx++, static_cast<cl_uint>(_upper_thr));
-    _kernel.setArg(idx++, static_cast<cl_uint>(width));
-    _kernel.setArg(idx++, static_cast<cl_uint>(height));
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    Window                 win                               = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal visited_access(_visited->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal recorded_access(_recorded->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal l1_stack_access(_l1_stack->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal l1_stack_counter_access(_l1_stack_counter->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(_input->info(), 0, num_elems_processed_per_iteration),
-                              output_access,
-                              visited_access,
-                              recorded_access,
-                              l1_stack_access,
-                              l1_stack_counter_access);
-
-    output_access.set_valid_region(win, _input->info()->valid_region());
-    visited_access.set_valid_region(win, _input->info()->valid_region());
-    recorded_access.set_valid_region(win, _input->info()->valid_region());
-    l1_stack_access.set_valid_region(win, _input->info()->valid_region());
-    l1_stack_counter_access.set_valid_region(win, _input->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += lower_string(string_from_format(output->info()->format()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLEdgeTraceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        add_2D_tensor_argument(idx, _visited, slice);
-        add_2D_tensor_argument(idx, _recorded, slice);
-        add_2D_tensor_argument(idx, _l1_stack, slice);
-        add_2D_tensor_argument(idx, _l1_stack_counter, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.h b/src/core/CL/kernels/CLCannyEdgeKernel.h
deleted file mode 100644
index 7543822..0000000
--- a/src/core/CL/kernels/CLCannyEdgeKernel.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCANNYEDGEKERNEL_H
-#define ARM_COMPUTE_CLCANNYEDGEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform Gradient computation.
- */
-class CLGradientKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGradientKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGradientKernel(const CLGradientKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGradientKernel &operator=(const CLGradientKernel &) = delete;
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @note gx, gy and mag must all be the same size (either 16 or 32).
-     *
-     * @param[in]  gx        Source tensor - Gx component. Data types supported: S16/S32.
-     * @param[in]  gy        Source tensor - Gy component. Data types supported: Same as gx.
-     * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy.
-     * @param[out] phase     Destination tensor - Quantized phase. Data types supported: U8.
-     * @param[in]  norm_type Normalization type. if 1, L1-Norm otherwise L2-Norm.
-     */
-    void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type);
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @note gx, gy and mag must all be the same size (either 16 or 32).
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  gx              Source tensor - Gx component. Data types supported: S16/S32.
-     * @param[in]  gy              Source tensor - Gy component. Data types supported: Same as gx.
-     * @param[out] magnitude       Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy.
-     * @param[out] phase           Destination tensor - Quantized phase. Data types supported: U8.
-     * @param[in]  norm_type       Normalization type. if 1, L1-Norm otherwise L2-Norm.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_gx;        /**< Source tensor - Gx component */
-    const ICLTensor *_gy;        /**< Source tensor - Gy component */
-    ICLTensor       *_magnitude; /**< Destination tensor - Magnitude */
-    ICLTensor       *_phase;     /**< Destination tensor - Quantized phase */
-};
-
-/** OpenCL kernel to perform Non-Maxima suppression for Canny Edge.
- *
- * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
- *       to characterize points as possible edges. The output buffer needs to be cleared before this kernel is executed.
- *
- * @note Hysteresis is computed in @ref CLEdgeTraceKernel
- */
-class CLEdgeNonMaxSuppressionKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLEdgeNonMaxSuppressionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeNonMaxSuppressionKernel(const CLEdgeNonMaxSuppressionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeNonMaxSuppressionKernel &operator=(const CLEdgeNonMaxSuppressionKernel &) = delete;
-    /** Initialise the kernel's sources, destination and border mode.
-     *
-     * @param[in]  magnitude        Source tensor - Magnitude. Data types supported: U16/U32.
-     * @param[in]  phase            Source tensor - Quantized phase. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U16/U32.
-     * @param[in]  lower_thr        Lower threshold.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined);
-    /** Initialise the kernel's sources, destination and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  magnitude        Source tensor - Magnitude. Data types supported: U16/U32.
-     * @param[in]  phase            Source tensor - Quantized phase. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U16/U32.
-     * @param[in]  lower_thr        Lower threshold.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_magnitude; /**< Source tensor - Magnitude. */
-    const ICLTensor *_phase;     /**< Source tensor - Quantized phase. */
-    ICLTensor       *_output;    /**< Destination tensor. */
-};
-
-/** OpenCL kernel to perform Edge tracing.
- */
-class CLEdgeTraceKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLEdgeTraceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeTraceKernel(const CLEdgeTraceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeTraceKernel &operator=(const CLEdgeTraceKernel &) = delete;
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]     input            Source tensor. Data types supported: U16/U32.
-     * @param[out]    output           Destination tensor. Data types supported: U8.
-     * @param[in]     upper_thr        Upper threshold used for the hysteresis
-     * @param[in]     lower_thr        Lower threshold used for the hysteresis
-     * @param[in,out] visited          Tensor for keeping the visited pixels. Data types supported: U32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] recorded         Tensor for keeping the recorded pixels. Data types supported: U32
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack         Tensor with the L1 stack for each pixel. Data types supported: S32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8.
-     *                                              Expected to be initialized to 0 before each run.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
-                   ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter);
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]     compile_context  The compile context to be used.
-     * @param[in]     input            Source tensor. Data types supported: U16/U32.
-     * @param[out]    output           Destination tensor. Data types supported: U8.
-     * @param[in]     upper_thr        Upper threshold used for the hysteresis
-     * @param[in]     lower_thr        Lower threshold used for the hysteresis
-     * @param[in,out] visited          Tensor for keeping the visited pixels. Data types supported: U32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] recorded         Tensor for keeping the recorded pixels. Data types supported: U32
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack         Tensor with the L1 stack for each pixel. Data types supported: S32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8.
-     *                                              Expected to be initialized to 0 before each run.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
-                   ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;            /**< Source tensor. */
-    ICLTensor       *_output;           /**< Destination tensor. */
-    int32_t          _lower_thr;        /**< Lower threshold used for the hysteresis. */
-    int32_t          _upper_thr;        /**< Upper threshold used for the hysteresis. */
-    ICLTensor       *_visited;          /**< Marks visited elements */
-    ICLTensor       *_recorded;         /**< Marks recorded elements */
-    ICLTensor       *_l1_stack;         /**< L1 hysteris stack */
-    ICLTensor       *_l1_stack_counter; /**< L1 hysteris stack counter */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCANNYEDGEKERNEL_H */
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
deleted file mode 100644
index 52ba9dd..0000000
--- a/src/core/CL/kernels/CLChannelCombineKernel.cpp
+++ /dev/null
@@ -1,296 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLChannelCombineKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLMultiImage.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-} // namespace
-
-CLChannelCombineKernel::CLChannelCombineKernel()
-    : _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } }
-{
-}
-
-void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output);
-}
-
-void CLChannelCombineKernel::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
-
-    const Format output_format = output->info()->format();
-
-    // Check if horizontal dimension of Y plane is even and validate horizontal sub-sampling dimensions for U and V planes
-    if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
-    {
-        // Validate Y plane of input and output
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output);
-
-        // Validate U and V plane of the input
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
-    }
-
-    _planes[0] = plane0;
-    _planes[1] = plane1;
-    _planes[2] = plane2;
-    _planes[3] = nullptr;
-
-    // Validate the last input tensor only for RGBA format
-    if(Format::RGBA8888 == output_format)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(plane3);
-        ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane3);
-
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8);
-
-        _planes[3] = plane3;
-    }
-
-    _output       = output;
-    _output_multi = nullptr;
-
-    // Half the processed elements for U and V channels due to horizontal sub-sampling of 2
-    if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
-    {
-        _x_subsampling[1] = 2;
-        _x_subsampling[2] = 2;
-    }
-
-    // Create kernel
-    std::string kernel_name = "channel_combine_" + string_from_format(output_format);
-    _kernel                 = create_kernel(compile_context, kernel_name);
-
-    // Configure window
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal plane0_access(plane0->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
-    AccessWindowRectangle  plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
-    AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, plane0_access, plane1_access, plane2_access, plane3_access, output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(),
-                                                       plane1->info()->valid_region(),
-                                                       plane2->info()->valid_region());
-    if(plane3 != nullptr)
-    {
-        valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region);
-    }
-    output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output);
-}
-
-void CLChannelCombineKernel::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
-
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
-
-    const Format output_format = output->info()->format();
-
-    // Validate shape of Y plane to be even and shape of sub-sampling dimensions for U and V planes
-    // Perform validation only for formats which require sub-sampling.
-    if(Format::YUV444 != output_format)
-    {
-        // Validate Y plane of input and output
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output->plane(0));
-
-        // Validate U and V plane of the input
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
-
-        // Validate second plane U (NV12 and NV21 have a UV88 combined plane while IYUV has only the U plane)
-        // MultiImage generates the correct tensor shape but also check in case the tensor shape of planes was changed to a wrong size
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(1));
-
-        // Validate the last plane V of format IYUV
-        if(Format::IYUV == output_format)
-        {
-            // Validate Y plane of the output
-            ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(2));
-        }
-    }
-
-    // Set input tensors
-    _planes[0] = plane0;
-    _planes[1] = plane1;
-    _planes[2] = plane2;
-    _planes[3] = nullptr;
-
-    // Set output tensor
-    _output       = nullptr;
-    _output_multi = output;
-
-    bool has_two_planars = false;
-
-    // Set sub-sampling parameters for each plane
-    std::string           kernel_name;
-    std::set<std::string> build_opts;
-
-    if(Format::NV12 == output_format || Format::NV21 == output_format)
-    {
-        _x_subsampling = { { 1, 2, 2 } };
-        _y_subsampling = { { 1, 2, 2 } };
-        kernel_name    = "channel_combine_NV";
-        build_opts.emplace(Format::NV12 == output_format ? "-DNV12" : "-DNV21");
-        has_two_planars = true;
-    }
-    else
-    {
-        if(Format::IYUV == output_format)
-        {
-            _x_subsampling = { { 1, 2, 2 } };
-            _y_subsampling = { { 1, 2, 2 } };
-        }
-
-        kernel_name = "copy_planes_3p";
-        build_opts.emplace(Format::IYUV == output_format ? "-DIYUV" : "-DYUV444");
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure window
-    Window win = calculate_max_window(*plane0->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowRectangle input_plane0_access(plane0->info(), 0, 0, num_elems_processed_per_iteration, 1.f);
-    AccessWindowRectangle input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
-    AccessWindowRectangle input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
-    AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f, 1.f / _y_subsampling[1]);
-    AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
-    AccessWindowRectangle output_plane2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
-
-    update_window_and_padding(win,
-                              input_plane0_access, input_plane1_access, input_plane2_access,
-                              output_plane0_access, output_plane1_access, output_plane2_access);
-
-    ValidRegion plane0_valid_region  = plane0->info()->valid_region();
-    ValidRegion output_plane1_region = has_two_planars ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
-    output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
-    output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape()));
-    output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLChannelCombineKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    slice.set_dimension_step(Window::DimY, 1);
-
-    do
-    {
-        // Subsampling in plane 1
-        Window win_sub_plane1(slice);
-        win_sub_plane1.set(Window::DimX, Window::Dimension(win_sub_plane1.x().start() / _x_subsampling[1], win_sub_plane1.x().end() / _x_subsampling[1], win_sub_plane1.x().step() / _x_subsampling[1]));
-        win_sub_plane1.set(Window::DimY, Window::Dimension(win_sub_plane1.y().start() / _y_subsampling[1], win_sub_plane1.y().end() / _y_subsampling[1], 1));
-
-        // Subsampling in plane 2
-        Window win_sub_plane2(slice);
-        win_sub_plane2.set(Window::DimX, Window::Dimension(win_sub_plane2.x().start() / _x_subsampling[2], win_sub_plane2.x().end() / _x_subsampling[2], win_sub_plane2.x().step() / _x_subsampling[2]));
-        win_sub_plane2.set(Window::DimY, Window::Dimension(win_sub_plane2.y().start() / _y_subsampling[2], win_sub_plane2.y().end() / _y_subsampling[2], 1));
-
-        unsigned int idx = 0;
-
-        // Set inputs
-        add_2D_tensor_argument(idx, _planes[0], slice);
-        add_2D_tensor_argument(idx, _planes[1], win_sub_plane1);
-        add_2D_tensor_argument(idx, _planes[2], win_sub_plane2);
-        add_2D_tensor_argument_if((nullptr != _planes[3]), idx, _planes[3], slice);
-
-        // Set outputs
-        if(nullptr != _output) // Single planar output
-        {
-            add_2D_tensor_argument(idx, _output, slice);
-        }
-        else // Multi-planar output
-        {
-            // Reduce slice in case of subsampling to avoid out-of bounds access
-            slice.set(Window::DimY, Window::Dimension(slice.y().start() / _y_subsampling[1], slice.y().end() / _y_subsampling[1], 1));
-
-            add_2D_tensor_argument(idx, _output_multi->cl_plane(0), slice);
-            add_2D_tensor_argument(idx, _output_multi->cl_plane(1), win_sub_plane1);
-            add_2D_tensor_argument_if((3 == num_planes_from_format(_output_multi->info()->format())), idx, _output_multi->cl_plane(2), win_sub_plane2);
-
-            _kernel.setArg(idx++, slice.y().end());
-        }
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.h b/src/core/CL/kernels/CLChannelCombineKernel.h
deleted file mode 100644
index f19995a..0000000
--- a/src/core/CL/kernels/CLChannelCombineKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H
-#define ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-#include <array>
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the channel combine kernel */
-class CLChannelCombineKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLChannelCombineKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelCombineKernel(const CLChannelCombineKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelCombineKernel &operator=(const CLChannelCombineKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLChannelCombineKernel(CLChannelCombineKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLChannelCombineKernel &operator=(CLChannelCombineKernel &&) = default;
-    /** Default destructor */
-    ~CLChannelCombineKernel() = default;
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[in]  plane3 The 2D plane that forms channel 3. Must be of U8 format.
-     * @param[out] output The single planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422.
-     */
-    void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  plane0          The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1          The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2          The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[in]  plane3          The 2D plane that forms channel 3. Must be of U8 format.
-     * @param[out] output          The single planar output tensor.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[out] output The multi planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422.
-     */
-    void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  plane0          The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1          The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2          The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[out] output          The multi planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    std::array<const ICLTensor *, 4> _planes;
-    ICLTensor     *_output;
-    ICLMultiImage *_output_multi;
-    std::array<uint32_t, 3> _x_subsampling;
-    std::array<uint32_t, 3> _y_subsampling;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H */
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
deleted file mode 100644
index cbf504b..0000000
--- a/src/core/CL/kernels/CLChannelExtractKernel.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLChannelExtractKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLMultiImage.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLChannelExtractKernel::CLChannelExtractKernel()
-    : _input(nullptr), _output(nullptr), _num_elems_processed_per_iteration(8), _subsampling(1)
-{
-}
-
-void CLChannelExtractKernel::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
-}
-
-void CLChannelExtractKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON(input == output);
-
-    set_format_if_unknown(*output->info(), Format::U8);
-
-    // Check if input tensor has a valid format
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
-    // Check if channel is valid for given format
-    const Format format = input->info()->format();
-    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
-
-    // Half the processed elements for U,V channels due to sub-sampling of 2
-    _subsampling = 1;
-
-    if(format == Format::YUYV422 || format == Format::UYVY422)
-    {
-        // Check if the width of the tensor shape is even for formats with subsampled channels (UYVY422 and YUYV422)
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input);
-
-        if(channel != Channel::Y)
-        {
-            _subsampling = 2;
-        }
-    }
-
-    // Calculate output tensor shape using subsampling
-    TensorShape output_shape = calculate_subsampled_shape(input->info()->tensor_shape(), format, channel);
-    set_shape_if_empty(*output->info(), output_shape);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    std::string           kernel_name = "channel_extract_" + string_from_format(format);
-    std::set<std::string> build_opts  = { ("-DCHANNEL_" + string_from_channel(channel)) };
-    _kernel                           = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure window
-    Window                 win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
-    AccessWindowRectangle  output_access(output->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    ValidRegion input_valid_region = input->info()->valid_region();
-    output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLChannelExtractKernel::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
-}
-
-void CLChannelExtractKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
-    set_format_if_unknown(*output->info(), Format::U8);
-
-    // Check if channel is valid for given format
-    const Format format = input->info()->format();
-    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
-
-    // Get input plane from the given channel
-    const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(format, channel));
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input_plane);
-
-    if(Channel::Y == channel && format != Format::YUV444)
-    {
-        // Check if the width of the tensor shape is even for formats with subsampled channels (UYVY422 and YUYV422)
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input_plane);
-    }
-
-    // Calculate 2x2 subsampled tensor shape
-    TensorShape output_shape = calculate_subsampled_shape(input->cl_plane(0)->info()->tensor_shape(), format, channel);
-    set_shape_if_empty(*output->info(), output_shape);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output_shape, output->info()->tensor_shape());
-
-    // Check if input tensor has a valid format
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
-
-    _output      = output;
-    _input       = input_plane;
-    _subsampling = 1;
-
-    // Create kernel
-    std::string           kernel_name;
-    std::set<std::string> build_opts;
-    if(Channel::Y == channel || Format::IYUV == format || Format::YUV444 == format)
-    {
-        kernel_name = "copy_plane";
-    }
-    else
-    {
-        kernel_name = "channel_extract_" + string_from_format(format);
-        build_opts.insert(("-DCHANNEL_" + string_from_channel(channel)));
-    }
-    _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure window
-    Window                 win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input_plane->info(), 0, _num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input_plane->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLChannelExtractKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        Window win_sub(slice);
-        win_sub.set(Window::DimX, Window::Dimension(win_sub.x().start() / _subsampling, win_sub.x().end() / _subsampling, win_sub.x().step() / _subsampling));
-        win_sub.set(Window::DimY, Window::Dimension(win_sub.y().start() / _subsampling, win_sub.y().end() / _subsampling, 1));
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument(idx, _output, win_sub);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.h b/src/core/CL/kernels/CLChannelExtractKernel.h
deleted file mode 100644
index 37abde5..0000000
--- a/src/core/CL/kernels/CLChannelExtractKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H
-#define ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the channel extract kernel */
-class CLChannelExtractKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLChannelExtractKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelExtractKernel(const CLChannelExtractKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelExtractKernel &operator=(const CLChannelExtractKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLChannelExtractKernel(CLChannelExtractKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLChannelExtractKernel &operator=(CLChannelExtractKernel &&) = default;
-    /** Default destructor */
-    ~CLChannelExtractKernel() = default;
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input   Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
-     * @param[in]  channel Channel to extract.
-     * @param[out] output  Destination tensor. Must be of U8 format.
-     */
-    void configure(const ICLTensor *input, Channel channel, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
-     * @param[in]  channel         Channel to extract.
-     * @param[out] output          Destination tensor. Must be of U8 format.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input   Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
-     * @param[in]  channel Channel to extract.
-     * @param[out] output  Single-planar 2D destination image. Must be of U8 format.
-     */
-    void configure(const ICLMultiImage *input, Channel channel, ICLImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
-     * @param[in]  channel         Channel to extract.
-     * @param[out] output          Single-planar 2D destination image. Must be of U8 format.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    uint32_t         _num_elems_processed_per_iteration;
-    uint32_t         _subsampling;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H */
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
deleted file mode 100644
index 6c61fec..0000000
--- a/src/core/CL/kernels/CLColorConvertKernel.cpp
+++ /dev/null
@@ -1,558 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLColorConvertKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLMultiImage.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <sstream>
-
-using namespace arm_compute;
-
-CLColorConvertKernel::CLColorConvertKernel()
-    : _input(nullptr), _output(nullptr), _multi_input(nullptr), _multi_output(nullptr)
-{
-}
-
-void CLColorConvertKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON(input == nullptr);
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
-
-    unsigned int num_elems_processed_per_iteration = 0;
-    switch(input->info()->format())
-    {
-        case Format::RGBA8888:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::UYVY422:
-        case Format::YUYV422:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                case Format::RGBA8888:
-                    num_elems_processed_per_iteration = 8;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::RGB888:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGBA8888:
-                case Format::U8:
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            break;
-    }
-    ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
-                                 string_from_format(input->info()->format()).c_str(),
-                                 string_from_format(output->info()->format()).c_str());
-
-    std::stringstream kernel_name;
-
-    kernel_name << string_from_format(input->info()->format());
-    kernel_name << "_to_";
-    kernel_name << string_from_format(output->info()->format());
-    kernel_name << "_bt709";
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name.str());
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name.str();
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
-
-    unsigned int num_elems_processed_per_iteration = 0;
-
-    switch(input->info()->format())
-    {
-        case Format::NV12:
-        case Format::NV21:
-        case Format::IYUV:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                case Format::RGBA8888:
-                    num_elems_processed_per_iteration = 4;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            break;
-    }
-    ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
-                                 string_from_format(input->info()->format()).c_str(),
-                                 string_from_format(output->info()->format()).c_str());
-
-    std::stringstream kernel_name;
-
-    kernel_name << string_from_format(input->info()->format());
-    kernel_name << "_to_";
-    kernel_name << string_from_format(output->info()->format());
-    kernel_name << "_bt709";
-
-    _multi_input = input;
-    _output      = output;
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name.str());
-
-    // Configure kernel window
-    const bool  has_two_planes = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
-    const float sub_sampling   = (has_two_planes || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
-
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    win.set_dimension_step(Window::DimY, 2);
-
-    AccessWindowHorizontal plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
-                                         sub_sampling, sub_sampling);
-    AccessWindowRectangle plane2_access(has_two_planes ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
-                                        sub_sampling, sub_sampling);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              plane0_access, plane1_access, plane2_access,
-                              output_access);
-
-    ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
-                                                           input->plane(2)->info()->valid_region());
-    output_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name.str();
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->plane(0)->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(1));
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->plane(1)->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(1));
-}
-
-void CLColorConvertKernel::configure(const ICLImage *input, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
-
-    unsigned int num_elems_processed_per_iteration = 0;
-    unsigned int num_elems_read_per_iteration_x    = 0;
-
-    bool  has_two_planes = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
-    float sub_sampling   = (has_two_planes || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
-
-    switch(input->info()->format())
-    {
-        case Format::RGB888:
-        case Format::RGBA8888:
-        {
-            switch(output->info()->format())
-            {
-                case Format::NV12:
-                case Format::IYUV:
-                    num_elems_processed_per_iteration = 2;
-                    num_elems_read_per_iteration_x    = 8;
-                    break;
-                case Format::YUV444:
-                    num_elems_processed_per_iteration = 4;
-                    num_elems_read_per_iteration_x    = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::UYVY422:
-        case Format::YUYV422:
-        {
-            switch(output->info()->format())
-            {
-                case Format::NV12:
-                case Format::IYUV:
-                    num_elems_processed_per_iteration = 8;
-                    num_elems_read_per_iteration_x    = 8;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            break;
-    }
-
-    ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
-                                 string_from_format(input->info()->format()).c_str(),
-                                 string_from_format(output->info()->format()).c_str());
-
-    std::stringstream kernel_name;
-
-    kernel_name << string_from_format(input->info()->format());
-    kernel_name << "_to_";
-    kernel_name << string_from_format(output->info()->format());
-    kernel_name << "_bt709";
-    _input        = input;
-    _multi_output = output;
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name.str());
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444))
-    {
-        win.set_dimension_step(Window::DimY, 2);
-    }
-
-    AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
-    AccessWindowRectangle  output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0,
-                                                num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
-
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_read_per_iteration_x);
-
-    update_window_and_padding(win,
-                              input_access,
-                              output_plane0_access,
-                              output_plane1_access,
-                              output_plane2_access);
-
-    ValidRegion input_region = input->info()->valid_region();
-
-    output_plane0_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(0)->info()->tensor_shape()));
-    output_plane1_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(1)->info()->tensor_shape()));
-    output_plane2_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(2)->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name.str();
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output)
-{
-    unsigned int num_elems_processed_per_iteration = 0;
-    switch(input->info()->format())
-    {
-        case Format::NV12:
-        case Format::NV21:
-        {
-            switch(output->info()->format())
-            {
-                case Format::IYUV:
-                case Format::YUV444:
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::IYUV:
-        {
-            switch(output->info()->format())
-            {
-                case Format::YUV444:
-                case Format::NV12:
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            break;
-    }
-    ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
-                                 string_from_format(input->info()->format()).c_str(),
-                                 string_from_format(output->info()->format()).c_str());
-
-    std::stringstream kernel_name;
-
-    kernel_name << string_from_format(input->info()->format());
-    kernel_name << "_to_";
-    kernel_name << string_from_format(output->info()->format());
-    kernel_name << "_bt709";
-
-    _multi_input  = input;
-    _multi_output = output;
-
-    // Create kernel
-    bool has_two_input_planars  = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
-    bool has_two_output_planars = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
-
-    float sub_sampling_input  = (has_two_input_planars || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
-    float sub_sampling_output = (has_two_output_planars || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
-
-    _kernel = create_kernel(compile_context, kernel_name.str());
-
-    Window win = calculate_max_window(*input->cl_plane(0)->info(), Steps(num_elems_processed_per_iteration));
-    win.set_dimension_step(Window::DimY, 2);
-
-    AccessWindowHorizontal input_plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  input_plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
-                                               sub_sampling_input, sub_sampling_input);
-    AccessWindowRectangle input_plane2_access(has_two_input_planars ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
-                                              sub_sampling_input, sub_sampling_input);
-    AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
-    AccessWindowRectangle  output_plane2_access(has_two_output_planars ? nullptr : output->plane(2)->info(), 0, 0,
-                                                num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
-
-    update_window_and_padding(win,
-                              input_plane0_access, input_plane1_access, input_plane2_access,
-                              output_plane0_access, output_plane1_access, output_plane2_access);
-
-    ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
-                                                           input->plane(2)->info()->valid_region());
-    output_plane0_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(0)->info()->tensor_shape()));
-    output_plane1_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(1)->info()->tensor_shape()));
-    output_plane2_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(2)->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name.str();
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->plane(0)->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(1));
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->plane(1)->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(1));
-}
-
-void CLColorConvertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    if(nullptr != _input && nullptr != _output)
-    {
-        do
-        {
-            unsigned int idx = 0;
-            add_2D_tensor_argument(idx, _input, slice);
-            add_2D_tensor_argument(idx, _output, slice);
-            enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window.slide_window_slice_2D(slice));
-    }
-    else if(nullptr != _input && nullptr != _multi_output)
-    {
-        Format format = _multi_output->info()->format();
-        do
-        {
-            Window win_uv(slice);
-
-            if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format))
-            {
-                win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-                win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-            }
-            unsigned int idx = 0;
-            add_2D_tensor_argument(idx, _input, slice);
-            add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice);
-            for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i)
-            {
-                add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_uv);
-            }
-            enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window.slide_window_slice_2D(slice));
-    }
-    else if(nullptr != _multi_input && nullptr != _output)
-    {
-        Format format = _multi_input->info()->format();
-        do
-        {
-            Window win_uv(slice);
-
-            if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format))
-            {
-                win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-                win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-            }
-
-            unsigned int idx = 0;
-            add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice);
-
-            for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i)
-            {
-                add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_uv);
-            }
-            add_2D_tensor_argument(idx, _output, slice);
-            enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window.slide_window_slice_2D(slice));
-    }
-    else if(nullptr != _multi_input && nullptr != _multi_output)
-    {
-        Format in_format  = _multi_input->info()->format();
-        Format out_format = _multi_output->info()->format();
-        do
-        {
-            Window win_in_uv(slice);
-            if((Format::NV12 == in_format) || (Format::NV21 == in_format) || (Format::IYUV == in_format))
-            {
-                win_in_uv.set(Window::DimX, Window::Dimension(win_in_uv.x().start() / 2,
-                                                              win_in_uv.x().end() / 2, win_in_uv.x().step() / 2));
-                win_in_uv.set(Window::DimY, Window::Dimension(win_in_uv.y().start() / 2, win_in_uv.y().end() / 2, 1));
-            }
-            unsigned int idx = 0;
-            add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice);
-            for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i)
-            {
-                add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_in_uv);
-            }
-
-            Window win_out_uv(slice);
-            if((Format::NV12 == out_format) || (Format::NV21 == out_format) || (Format::IYUV == out_format))
-            {
-                win_out_uv.set(Window::DimX, Window::Dimension(win_out_uv.x().start() / 2,
-                                                               win_out_uv.x().end() / 2, win_out_uv.x().step() / 2));
-                win_out_uv.set(Window::DimY, Window::Dimension(win_out_uv.y().start() / 2, win_out_uv.y().end() / 2, 1));
-            }
-
-            add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice);
-            for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i)
-            {
-                add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_out_uv);
-            }
-            enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window.slide_window_slice_2D(slice));
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Not supported");
-    }
-}
diff --git a/src/core/CL/kernels/CLColorConvertKernel.h b/src/core/CL/kernels/CLColorConvertKernel.h
deleted file mode 100644
index 0f08291..0000000
--- a/src/core/CL/kernels/CLColorConvertKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOLORCONVERTKERNEL_H
-#define ARM_COMPUTE_CLCOLORCONVERTKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the color convert kernel.
- *
- */
-class CLColorConvertKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLColorConvertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLColorConvertKernel(const CLColorConvertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLColorConvertKernel &operator=(const CLColorConvertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLColorConvertKernel(CLColorConvertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLColorConvertKernel &operator=(CLColorConvertKernel &&) = default;
-    /** Default destructor. */
-    ~CLColorConvertKernel() = default;
-
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
-     * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
-     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
-     *                                                          U8 (if the formats of @p input is RGB888)
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
-     * @param[out] output          Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
-     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
-     *                                                          U8 (if the formats of @p input is RGB888)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
-     */
-    void configure(const ICLMultiImage *input, ICLImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output          Single-planar destination image. Formats supported: RGB888/RGBA8888
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
-     */
-    void configure(const ICLImage *input, ICLMultiImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     * @param[out] output          Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of  @p input is IYUV)
-     */
-    void configure(const ICLMultiImage *input, ICLMultiImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output          Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of  @p input is IYUV)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor     *_input;        /*pointer to single planar tensor input */
-    ICLTensor           *_output;       /*pointer to single planar tensor output */
-    const ICLMultiImage *_multi_input;  /*pointer to multi-planar input */
-    ICLMultiImage       *_multi_output; /*pointer to multi-planar output */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCOLORCONVERTKERNEL_H */
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
deleted file mode 100644
index 21f1047..0000000
--- a/src/core/CL/kernels/CLConvolutionKernel.cpp
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLConvolutionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <sstream>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int max_matrix_size = 81;
-} // namespace
-
-/****************************************************************************************\
- *                                 Square Convolution                                *
-\****************************************************************************************/
-
-template <unsigned int matrix_size>
-BorderSize             CLConvolutionKernel<matrix_size>::border_size() const
-{
-    return BorderSize(matrix_size / 2);
-}
-
-template <unsigned int matrix_size>
-void CLConvolutionKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_undefined);
-}
-
-template <unsigned int matrix_size>
-void CLConvolutionKernel<matrix_size>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(conv == nullptr);
-
-    _input  = input;
-    _output = output;
-
-    std::stringstream kernel_name;
-    CLBuildOptions    build_opts;
-    kernel_name << "convolution" << matrix_size << "x" << matrix_size << "_static";
-
-    if(scale == 0)
-    {
-        scale = calculate_matrix_scale(conv, matrix_size);
-    }
-
-    for(unsigned int i = 0; i < matrix_size * matrix_size; i++)
-    {
-        std::stringstream mat_str;
-        mat_str << "-DMAT" << i << "=" << conv[i];
-        build_opts.add_option(mat_str.str());
-    }
-
-    build_opts.add_option("-DSCALE=" + support::cpp11::to_string(scale));
-
-    DataType data_type = data_type_for_convolution_matrix(conv, matrix_size * matrix_size);
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-
-    std::stringstream out_type;
-    out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
-    build_opts.add_option(out_type.str());
-
-    _kernel = create_kernel(compile_context, kernel_name.str(), build_opts.options());
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = matrix_size;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
-
-/****************************************************************************************\
- *                                 Separable Convolution                                *
-\****************************************************************************************/
-template <unsigned int matrix_size>
-CLSeparableConvolutionHorKernel<matrix_size>::CLSeparableConvolutionHorKernel()
-    : _border_size(0)
-{
-}
-
-template <unsigned int matrix_size>
-BorderSize             CLSeparableConvolutionHorKernel<matrix_size>::border_size() const
-{
-    return _border_size;
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionHorKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, border_undefined);
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionHorKernel<matrix_size>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
-
-    ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
-
-    _input       = input;
-    _output      = output;
-    _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    std::array<int16_t, matrix_size *matrix_size> mat = { 0 };
-    memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));
-
-    for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
-    {
-        build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
-    }
-
-    build_opts.insert("-DSCALE=0");
-
-    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-    // Create kernel
-    const std::string kernel_name = "convolution_separable1x" + support::cpp11::to_string(matrix_size) + "_static";
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-template <unsigned int matrix_size>
-BorderSize             CLSeparableConvolutionVertKernel<matrix_size>::border_size() const
-{
-    return BorderSize{ matrix_size / 2, 0 };
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionVertKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output,
-                                                              const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_undefined, data_type);
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionVertKernel<matrix_size>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
-                                                              const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
-    ARM_COMPUTE_ERROR_ON(scale == 0);
-
-    _input  = input;
-    _output = output;
-
-    std::set<std::string> build_opts;
-
-    std::array<int16_t, matrix_size *matrix_size> mat = { 0 };
-    memcpy(mat.data() + matrix_size, conv, matrix_size * sizeof(int16_t));
-
-    for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
-    {
-        build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
-    }
-
-    build_opts.insert("-DSCALE=" + support::cpp11::to_string(scale));
-
-    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-
-    build_opts.insert("-DCOMPUTE_TYPE=" + get_cl_type_from_data_type(data_type));
-
-    std::stringstream out_type;
-    out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
-    build_opts.insert(out_type.str());
-
-    // Create kernel
-    const std::string kernel_name = "convolution_separable" + support::cpp11::to_string(matrix_size) + "x1_static";
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = matrix_size;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(data_type));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-/****************************************************************************************\
- *                                 Rectangle Convolution                                *
-\****************************************************************************************/
-
-CLConvolutionRectangleKernel::CLConvolutionRectangleKernel()
-    : _border_size(0), _input(nullptr), _output(nullptr)
-{
-}
-
-BorderSize CLConvolutionRectangleKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLConvolutionRectangleKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, width, height, scale, border_undefined);
-}
-
-void CLConvolutionRectangleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale,
-                                             bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(nullptr == conv);
-    ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width);
-    ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height);
-    ARM_COMPUTE_ERROR_ON(0 == scale);
-
-    _input       = input;
-    _output      = output;
-    _border_size = BorderSize(height / 2, width / 2);
-
-    std::set<std::string> options;
-
-    std::stringstream output_type;
-    output_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
-    options.insert(output_type.str());
-
-    uint32_t matrix_size = width * height;
-
-    std::array<int16_t, max_matrix_size> mat = { 0 };
-
-    memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));
-
-    for(unsigned int j = 0; j < max_matrix_size; j++)
-    {
-        options.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
-    }
-
-    options.insert("-DSCALE=" + support::cpp11::to_string(scale));
-
-    DataType data_type = data_type_for_convolution_matrix(conv, matrix_size);
-    options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-
-    options.insert("-DMATRIX_WIDTH=" + support::cpp11::to_string(width));
-    options.insert("-DMATRIX_HEIGHT=" + support::cpp11::to_string(height));
-
-    _kernel = create_kernel(compile_context, "convolution_rectangle", options);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    const unsigned int     num_rows_read_per_iteration       = height;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLConvolutionRectangleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-template class arm_compute::CLConvolutionKernel<3>;
-template class arm_compute::CLConvolutionKernel<5>;
-template class arm_compute::CLConvolutionKernel<7>;
-template class arm_compute::CLConvolutionKernel<9>;
-template class arm_compute::CLSeparableConvolutionVertKernel<5>;
-template class arm_compute::CLSeparableConvolutionVertKernel<7>;
-template class arm_compute::CLSeparableConvolutionVertKernel<9>;
-template class arm_compute::CLSeparableConvolutionHorKernel<5>;
-template class arm_compute::CLSeparableConvolutionHorKernel<7>;
-template class arm_compute::CLSeparableConvolutionHorKernel<9>;
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLConvolutionKernel.h b/src/core/CL/kernels/CLConvolutionKernel.h
deleted file mode 100644
index 33e73ca..0000000
--- a/src/core/CL/kernels/CLConvolutionKernel.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONVOLUTIONKERNEL_H
-#define ARM_COMPUTE_CLCONVOLUTIONKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/****************************************************************************************\
- *                                    Square Convolution                                *
-\****************************************************************************************/
-
-/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
- * The client can supply a convolution matrix \f$ C_{m,n} \f$.
- * @f{eqnarray}{
- *  k_0 &=& \frac{m}{2}  \\
- *  l_0 &=& \frac{n}{2}  \\
- *  sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
- *  @f}
- *
- * @note The above equation for this function is similar to the default OpenCV Filter2D function,
- *       which actually computes a correlation and not a convolution.
- *       In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
- */
-template <unsigned int matrix_size>
-class CLConvolutionKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-
-/** Interface for the kernel which applies a 3x3 convolution to a tensor. */
-using CLConvolution3x3Kernel = CLConvolutionKernel<3>;
-/** Interface for the kernel which applies a 5x5 convolution to a tensor. */
-using CLConvolution5x5Kernel = CLConvolutionKernel<5>;
-/** Interface for the kernel which applies a 7x7 convolution to a tensor. */
-using CLConvolution7x7Kernel = CLConvolutionKernel<7>;
-/** Interface for the kernel which applies a 9x9 convolution to a tensor. */
-using CLConvolution9x9Kernel = CLConvolutionKernel<9>;
-
-/****************************************************************************************\
- *                              Separable Square Convolution                            *
-\****************************************************************************************/
-
-/** Kernel for the Horizontal pass of a Separable Convolution. Currently support 5x5, 7x7, 9x9 */
-template <unsigned int matrix_size>
-class CLSeparableConvolutionHorKernel : public ICLSimple2DKernel
-{
-public:
-    /** Default Constructor */
-    CLSeparableConvolutionHorKernel();
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U16/S16/S32.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-
-private:
-    BorderSize _border_size; /**< Border size */
-};
-
-/** Interface for the kernel which applies a horizontal pass of 5x5 convolution to a tensor. */
-using CLSeparableConvolution5x5HorKernel = CLSeparableConvolutionHorKernel<5>;
-/** Interface for the kernel which applies a horizontal pass of 7x7 convolution to a tensor. */
-using CLSeparableConvolution7x7HorKernel = CLSeparableConvolutionHorKernel<7>;
-/** Interface for the kernel which applies a horizontal pass of 9x9 convolution to a tensor. */
-using CLSeparableConvolution9x9HorKernel = CLSeparableConvolutionHorKernel<9>;
-
-/** Kernel for the Vertical pass of a Separable Convolution. Currently supports 5x5, 7x7, 9x9 */
-template <unsigned int matrix_size>
-class CLSeparableConvolutionVertKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U16/S16/S32.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in]  data_type        Data type to use for intermeidate result. @sa data_type_for_convolution
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U16/S16/S32.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in]  data_type        Data type to use for intermeidate result. @sa data_type_for_convolution
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-
-/** Interface for the kernel which applies a vertical pass of 5x5 convolution to a tensor. */
-using CLSeparableConvolution5x5VertKernel = CLSeparableConvolutionVertKernel<5>;
-/** Interface for the kernel which applies a vertical pass of 7x7 convolution to a tensor. */
-using CLSeparableConvolution7x7VertKernel = CLSeparableConvolutionVertKernel<7>;
-/** Interface for the kernel which applies a vertical pass of 9x9 convolution to a tensor. */
-using CLSeparableConvolution9x9VertKernel = CLSeparableConvolutionVertKernel<9>;
-
-/****************************************************************************************\
- *                                 Rectangle Convolution                                *
-\****************************************************************************************/
-
-/** Kernel for the running convolution on a rectangle matrix.
- *
- * @note Supports combinations of 3,5,7 and 9.
- */
-class CLConvolutionRectangleKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLConvolutionRectangleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvolutionRectangleKernel(const CLConvolutionRectangleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvolutionRectangleKernel &operator=(const CLConvolutionRectangleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLConvolutionRectangleKernel(CLConvolutionRectangleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLConvolutionRectangleKernel &operator=(CLConvolutionRectangleKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  width            Width of convolution matrix (Number of columns)
-     * @param[in]  height           Height of convolution matrix (Number of rows)
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  width            Width of convolution matrix (Number of columns)
-     * @param[in]  height           Height of convolution matrix (Number of rows)
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    BorderSize       _border_size;
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCONVOLUTIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
deleted file mode 100644
index 5ff1136..0000000
--- a/src/core/CL/kernels/CLDerivativeKernel.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDerivativeKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLDerivativeKernel::CLDerivativeKernel()
-    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_derivative_x(false), _run_derivative_y(false)
-{
-}
-
-BorderSize CLDerivativeKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLDerivativeKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLDerivativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_derivative_x = output_x != nullptr;
-    _run_derivative_y = output_y != nullptr;
-
-    if(_run_derivative_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_derivative_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input    = input;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_derivative_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_derivative_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("derivative");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    constexpr unsigned int num_read_rows_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), 0, 0, 0, 0);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
-    if(_run_derivative_x && _run_derivative_y)
-    {
-        // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2'
-        input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration + 2, num_read_rows_per_iteration);
-    }
-    else if(_run_derivative_x)
-    {
-        // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2'
-        input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration + 2);
-    }
-    else if(_run_derivative_y)
-    {
-        input_access = AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration);
-    }
-
-    update_window_and_padding(win,
-                              input_access,
-                              output_x_access,
-                              output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLDerivativeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument_if((_run_derivative_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_derivative_y), idx, _output_y, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLDerivativeKernel.h b/src/core/CL/kernels/CLDerivativeKernel.h
deleted file mode 100644
index 14dd05d..0000000
--- a/src/core/CL/kernels/CLDerivativeKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDERIVATIVEKERNEL_H
-#define ARM_COMPUTE_CLDERIVATIVEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the derivative kernel. */
-class CLDerivativeKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLDerivativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDerivativeKernel(const CLDerivativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDerivativeKernel &operator=(const CLDerivativeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLDerivativeKernel(CLDerivativeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLDerivativeKernel &operator=(CLDerivativeKernel &&) = default;
-    /** Default destructor */
-    ~CLDerivativeKernel() = default;
-    /** Initialise the kernel's sources, destination and border
-     *
-     * @note At least one of output_x or output_y must be set
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's sources, destination and border
-     *
-     * @note At least one of output_x or output_y must be set
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;            /**< Input tensor */
-    ICLTensor       *_output_x;         /**< Output tensor - Derivate along the X direction */
-    ICLTensor       *_output_y;         /**< Output tensor - Derivate along the Y direction */
-    bool             _run_derivative_x; /**< Do we need to run Derivative X ? */
-    bool             _run_derivative_y; /**< Do we need to run Derivative Y ? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDERIVATIVEKERNEL_H */
diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp
deleted file mode 100644
index cac5bc1..0000000
--- a/src/core/CL/kernels/CLDilateKernel.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDilateKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-using namespace arm_compute;
-
-BorderSize CLDilateKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLDilateKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLDilateKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "dilate");
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLDilateKernel.h b/src/core/CL/kernels/CLDilateKernel.h
deleted file mode 100644
index 591ec8c..0000000
--- a/src/core/CL/kernels/CLDilateKernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDILATEKERNEL_H
-#define ARM_COMPUTE_CLDILATEKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the dilate kernel.
- *
- */
-class CLDilateKernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDILATEKERNEL_H */
diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp
deleted file mode 100644
index f6d98a5..0000000
--- a/src/core/CL/kernels/CLErodeKernel.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLErodeKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-using namespace arm_compute;
-
-BorderSize CLErodeKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLErodeKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLErodeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "erode");
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_rows_read_pes_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_pes_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLErodeKernel.h b/src/core/CL/kernels/CLErodeKernel.h
deleted file mode 100644
index 4da97ae..0000000
--- a/src/core/CL/kernels/CLErodeKernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLERODEKERNEL_H
-#define ARM_COMPUTE_CLERODEKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the erode kernel.
- *
- */
-class CLErodeKernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLERODEKERNEL_H */
diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp
deleted file mode 100644
index 7481fd1..0000000
--- a/src/core/CL/kernels/CLFastCornersKernel.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLFastCornersKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLFastCornersKernel::CLFastCornersKernel()
-    : ICLKernel(), _input(nullptr), _output(nullptr)
-{
-}
-
-BorderSize CLFastCornersKernel::border_size() const
-{
-    return BorderSize(3);
-}
-
-void CLFastCornersKernel::configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, non_max_suppression, border_mode);
-}
-
-void CLFastCornersKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MSG(border_mode != BorderMode::UNDEFINED, "Not implemented");
-
-    _input  = input;
-    _output = output;
-
-    // Create build options
-    std::set<std::string> build_opts;
-
-    if(non_max_suppression)
-    {
-        build_opts.emplace("-DUSE_MAXSUPPRESSION");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("fast_corners");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
-    _kernel.setArg<cl_float>(idx, static_cast<float>(threshold));
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    constexpr unsigned int num_elems_read_per_iteration      = 7;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_mode == BorderMode::UNDEFINED, BorderSize(3));
-
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_mode == BorderMode::UNDEFINED, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(non_max_suppression);
-    _config_id += "_";
-    _config_id += lower_string(string_from_border_mode(border_mode));
-}
-
-void CLFastCornersKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLCopyToArrayKernel::CLCopyToArrayKernel()
-    : ICLKernel(), _input(nullptr), _corners(nullptr), _num_buffer(nullptr)
-{
-}
-
-void CLCopyToArrayKernel::configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, update_number, corners, num_buffers);
-}
-
-void CLCopyToArrayKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(corners == nullptr);
-    ARM_COMPUTE_ERROR_ON(num_buffers == nullptr);
-
-    _input      = input;
-    _corners    = corners;
-    _num_buffer = num_buffers;
-
-    std::set<std::string> build_opts;
-
-    if(update_number)
-    {
-        build_opts.emplace("-DUPDATE_NUMBER");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("copy_to_keypoint");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    //Get how many pixels skipped in the x dimension in the previous stages
-    unsigned int offset = _input->info()->valid_region().anchor.x();
-
-    // Set static kernel arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input and output parameters
-    _kernel.setArg<unsigned int>(idx++, _corners->max_num_values());
-    _kernel.setArg<cl_uint>(idx++, offset);
-    _kernel.setArg(idx++, *_num_buffer);
-    _kernel.setArg(idx++, _corners->cl_buffer());
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    Window                 win                               = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLCopyToArrayKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    //Initialise the _num_buffer as it used as both input and output
-    static const unsigned int zero_init = 0;
-    queue.enqueueWriteBuffer(*_num_buffer, CL_FALSE, 0, sizeof(unsigned int), &zero_init);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLFastCornersKernel.h b/src/core/CL/kernels/CLFastCornersKernel.h
deleted file mode 100644
index 0c1b564..0000000
--- a/src/core/CL/kernels/CLFastCornersKernel.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFASTCORNERSKERNEL_H
-#define ARM_COMPUTE_CLFASTCORNERSKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-#include <cstdint>
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** CL kernel to perform fast corners */
-class CLFastCornersKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLFastCornersKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFastCornersKernel(const CLFastCornersKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFastCornersKernel &operator=(const CLFastCornersKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLFastCornersKernel(CLFastCornersKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLFastCornersKernel &operator=(CLFastCornersKernel &&) = default;
-    /** Default destructor */
-    ~CLFastCornersKernel() = default;
-
-    /** Initialise the kernel.
-     *
-     * @param[in]  input               Source image. Data types supported: U8.
-     * @param[out] output              Output image. Data types supported: U8.
-     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
-     * @param[in]  non_max_suppression True if non-maxima suppresion is applied, false otherwise.
-     * @param[in]  border_mode         Strategy to use for borders.
-     */
-    void configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
-    /** Initialise the kernel.
-     *
-     * @param[in]  compile_context     The compile context to be used.
-     * @param[in]  input               Source image. Data types supported: U8.
-     * @param[out] output              Output image. Data types supported: U8.
-     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
-     * @param[in]  non_max_suppression True if non-maxima suppresion is applied, false otherwise.
-     * @param[in]  border_mode         Strategy to use for borders.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
-
-    // Inherited methods overridden
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLImage *_input;
-    ICLImage       *_output;
-};
-
-/** CL kernel to copy keypoints information to ICLKeyPointArray and counts the number of key points */
-class CLCopyToArrayKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLCopyToArrayKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCopyToArrayKernel(const CLCopyToArrayKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCopyToArrayKernel &operator=(const CLCopyToArrayKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLCopyToArrayKernel(CLCopyToArrayKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLCopyToArrayKernel &operator=(CLCopyToArrayKernel &&) = default;
-    /** Default destructor */
-    ~CLCopyToArrayKernel() = default;
-
-    /** Initialise the kernel.
-     *
-     * @param[in]  input         Source image. Data types supported: U8.
-     * @param[in]  update_number Flag to indicate whether we need to update the number of corners
-     * @param[out] corners       Array of keypoints to store the results.
-     * @param[out] num_buffers   Number of keypoints to store the results.
-     */
-    void configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
-    /** Initialise the kernel.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source image. Data types supported: U8.
-     * @param[in]  update_number   Flag to indicate whether we need to update the number of corners
-     * @param[out] corners         Array of keypoints to store the results.
-     * @param[out] num_buffers     Number of keypoints to store the results.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage   *_input;      /**< source image */
-    ICLKeyPointArray *_corners;    /**< destination array */
-    cl::Buffer       *_num_buffer; /**< CL memory to record number of key points in the array */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLFASTCORNERSKERNEL_H */
diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
deleted file mode 100644
index 40e9658..0000000
--- a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGaussian3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-BorderSize CLGaussian3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLGaussian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLGaussian3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    _input  = input;
-    _output = output;
-
-    // Set build options
-    std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=2", "-DMAT2=1",
-                                         "-DMAT3=2", "-DMAT4=4", "-DMAT5=2",
-                                         "-DMAT6=1", "-DMAT7=2", "-DMAT8=1",
-                                         "-DSCALE=16", "-DDATA_TYPE_OUT=uchar"
-                                       };
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "convolution3x3_static", build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.h b/src/core/CL/kernels/CLGaussian3x3Kernel.h
deleted file mode 100644
index 139b05d..0000000
--- a/src/core/CL/kernels/CLGaussian3x3Kernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H
-#define ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Gaussian 3x3 filter kernel.
- *
- */
-class CLGaussian3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H */
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
deleted file mode 100644
index 46a7576..0000000
--- a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
-
-#include <cstdint>
-
-using namespace arm_compute;
-
-void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLGaussian5x5HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    const std::array<int16_t, 5> matrix = { 1, 4, 6, 4, 1 };
-
-    // Set arguments
-    CLSeparableConvolution5x5HorKernel::configure(compile_context, input, output, matrix.data(), border_undefined);
-}
-
-void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLGaussian5x5VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    const uint32_t scale = 256;
-    const std::array<int16_t, 5> matrix = { 1, 4, 6, 4, 1 };
-
-    // Set arguments
-    CLSeparableConvolution5x5VertKernel::configure(compile_context, input, output, matrix.data(), scale, border_undefined);
-}
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.h b/src/core/CL/kernels/CLGaussian5x5Kernel.h
deleted file mode 100644
index 711710b..0000000
--- a/src/core/CL/kernels/CLGaussian5x5Kernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H
-#define ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H
-
-#include "src/core/CL/kernels/CLConvolutionKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 5x5 Gaussian filter on a tensor. */
-class CLGaussian5x5HorKernel : public CLSeparableConvolution5x5HorKernel
-{
-public:
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-private:
-    //Make the configure method of the parent class private
-    using CLSeparableConvolution5x5HorKernel::configure;
-};
-
-/** Interface for the kernel to run the vertical pass of 5x5 Gaussian filter on a tensor. */
-class CLGaussian5x5VertKernel : public CLSeparableConvolution5x5VertKernel
-{
-public:
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  input            Input tensor(output of horizontal pass). Data types supported: S16.
-     * @param[out] output           Destination tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Input tensor(output of horizontal pass). Data types supported: S16.
-     * @param[out] output           Destination tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-private:
-    //Make the configure method of the parent class private
-    using CLSeparableConvolution5x5VertKernel::configure;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H */
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
deleted file mode 100644
index 065f7f7..0000000
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGaussianPyramidKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel()
-    : _l2_load_offset(0)
-{
-}
-
-BorderSize CLGaussianPyramidHorKernel::border_size() const
-{
-    return BorderSize{ 0, 2 };
-}
-
-void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLGaussianPyramidHorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
-
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
-    }
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    const std::string kernel_name = std::string("gaussian1x5_sub_x");
-    _kernel                       = create_kernel(compile_context, kernel_name);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    constexpr unsigned int num_elems_read_per_iteration      = 20;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    const float            scale_x                           = static_cast<float>(output->info()->dimension(0)) / input->info()->dimension(0);
-
-    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
-
-    // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even
-    // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether
-    // a pixel is even or odd is determined based on the tensor shape not the
-    // valid region!)
-    // Thus the offset from which the first pixel (L2) for the convolution is
-    // loaded depends on the anchor and shape of the valid region.
-    // In the case of an even shape (= even image width) we need to load L2
-    // from -2 if the anchor is odd and from -1 if the anchor is even. That
-    // makes sure that L2 is always loaded from an odd pixel.
-    // On the other hand, for an odd shape (= odd image width) we need to load
-    // L2 from -1 if the anchor is odd and from -2 if the anchor is even to
-    // achieve the opposite effect.
-    // The condition can be simplified to checking whether anchor + shape is
-    // odd (-2) or even (-1) as only adding an odd and an even number will have
-    // an odd result.
-    _l2_load_offset = -border_size().left;
-
-    if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0)
-    {
-        _l2_load_offset += 1;
-    }
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
-                              output_access);
-
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLGaussianPyramidHorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window win_in(window);
-    win_in.shift(Window::DimX, _l2_load_offset);
-
-    //The output is half the width of the input:
-    Window win_out(window);
-    win_out.scale(Window::DimX, 0.5f);
-
-    Window slice_in  = win_in.first_slice_window_2D();
-    Window slice_out = win_out.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice_in);
-        add_2D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice_out, lws_hint());
-    }
-    while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
-}
-
-CLGaussianPyramidVertKernel::CLGaussianPyramidVertKernel()
-    : _t2_load_offset(0)
-{
-}
-
-BorderSize CLGaussianPyramidVertKernel::border_size() const
-{
-    return BorderSize{ 2, 0 };
-}
-
-void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLGaussianPyramidVertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
-
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
-    }
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    const std::string kernel_name = std::string("gaussian5x1_sub_y");
-    _kernel                       = create_kernel(compile_context, "gaussian5x1_sub_y");
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_rows_processed_per_iteration  = 2;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 8;
-    constexpr unsigned int num_rows_per_iteration            = 5;
-
-    const float scale_y = static_cast<float>(output->info()->dimension(1)) / input->info()->dimension(1);
-
-    Window                win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration));
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y);
-
-    // Determine whether we need to load even or odd rows. See above for a
-    // detailed explanation.
-    _t2_load_offset = -border_size().top;
-
-    if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0)
-    {
-        _t2_load_offset += 1;
-    }
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration),
-                              output_access);
-
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLGaussianPyramidVertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(window.x().step() != 8);
-    ARM_COMPUTE_ERROR_ON(window.y().step() % 2);
-
-    Window win_in(window);
-    win_in.shift(Window::DimY, _t2_load_offset);
-
-    Window win_out(window);
-    win_out.scale(Window::DimY, 0.5f);
-
-    Window slice_in  = win_in.first_slice_window_2D();
-    Window slice_out = win_out.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice_in);
-        add_2D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice_out, lws_hint());
-    }
-    while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
-}
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.h b/src/core/CL/kernels/CLGaussianPyramidKernel.h
deleted file mode 100644
index a659544..0000000
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H
-#define ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H
-
-#include "src/core/CL/ICLSimpleKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a Gaussian filter and half scaling across width (horizontal pass) */
-class CLGaussianPyramidHorKernel : public ICLSimpleKernel
-{
-public:
-    /** Default constructor */
-    CLGaussianPyramidHorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidHorKernel(const CLGaussianPyramidHorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidHorKernel &operator=(const CLGaussianPyramidHorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidHorKernel(CLGaussianPyramidHorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidHorKernel &operator=(CLGaussianPyramidHorKernel &&) = default;
-    /** Default destructor */
-    ~CLGaussianPyramidHorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor. Output should have half the input width. Data types supported: U16.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor. Output should have half the input width. Data types supported: U16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    int _l2_load_offset;
-};
-
-/** OpenCL kernel to perform a Gaussian filter and half scaling across height (vertical pass) */
-class CLGaussianPyramidVertKernel : public ICLSimpleKernel
-{
-public:
-    /** Default constructor */
-    CLGaussianPyramidVertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidVertKernel(const CLGaussianPyramidVertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidVertKernel &operator=(const CLGaussianPyramidVertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidVertKernel(CLGaussianPyramidVertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidVertKernel &operator=(CLGaussianPyramidVertKernel &&) = default;
-    /** Default destructor */
-    ~CLGaussianPyramidVertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U16.
-     * @param[out] output Destination tensor. Output should have half the input height. Data types supported: U8.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U16.
-     * @param[out] output          Destination tensor. Output should have half the input height. Data types supported: U8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    int _t2_load_offset;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H */
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
deleted file mode 100644
index cd3f1ee..0000000
--- a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel()
-    : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size()
-{
-}
-
-void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input_magnitude, input_phase, output, hog_info);
-}
-
-void CLHOGOrientationBinningKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
-    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
-
-    _input_magnitude = input_magnitude;
-    _input_phase     = input_phase;
-    _output          = output;
-    _cell_size       = hog_info->cell_size();
-
-    float phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? hog_info->num_bins() / 360.0f : hog_info->num_bins() / 180.0f);
-    phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
-
-    std::stringstream args_str;
-    args_str << "-DCELL_WIDTH=" << hog_info->cell_size().width << " ";
-    args_str << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " ";
-    args_str << "-DNUM_BINS=" << hog_info->num_bins() << " ";
-    args_str << "-DPHASE_SCALE=" << phase_scale << " ";
-
-    // Construct kernel name
-    std::set<std::string> build_opts = {};
-    build_opts.insert(args_str.str());
-
-    // Create kernel
-    const std::string kernel_name = std::string("hog_orientation_binning");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    constexpr unsigned int num_elems_read_per_iteration      = 1;
-    const unsigned int     num_rows_read_per_iteration       = hog_info->cell_size().height;
-    constexpr unsigned int num_elems_written_per_iteration   = 1;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input_magnitude->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input_magnitude->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input_magnitude->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        // Compute slice for the magnitude and phase tensors
-        Window slice_mag_phase = window.first_slice_window_2D();
-        slice_mag_phase.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width));
-        slice_mag_phase.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height));
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input_magnitude, slice_mag_phase);
-        add_2D_tensor_argument(idx, _input_phase, slice_mag_phase);
-        add_2D_tensor_argument(idx, _output, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel()
-    : _input(nullptr), _output(nullptr), _num_cells_per_block_stride()
-{
-}
-
-void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, hog_info);
-}
-
-void CLHOGBlockNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
-{
-    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
-
-    // Number of cells per block
-    const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
-                                     hog_info->block_size().height / hog_info->cell_size().height);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * num_cells_per_block.area(), DataType::F32);
-
-    // Number of cells per block stride
-    const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
-                                            hog_info->block_stride().height / hog_info->cell_size().height);
-
-    _input                      = input;
-    _output                     = output;
-    _num_cells_per_block_stride = num_cells_per_block_stride;
-
-    std::stringstream args_str;
-    args_str << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " ";
-    args_str << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << num_cells_per_block.height << " ";
-    args_str << "-DNUM_BINS_PER_BLOCK_X=" << num_cells_per_block.width *hog_info->num_bins() << " ";
-    args_str << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " ";
-    args_str << "-DL2_NORM=" << static_cast<int>(HOGNormType::L2_NORM) << " ";
-    args_str << "-DL1_NORM=" << static_cast<int>(HOGNormType::L1_NORM) << " ";
-    args_str << "-DL2HYS_NORM=" << static_cast<int>(HOGNormType::L2HYS_NORM) << " ";
-    args_str << "-DHOG_NORM_TYPE=" << static_cast<int>(hog_info->normalization_type()) << " ";
-
-    // Construct kernel name
-    std::set<std::string> build_opts = {};
-    build_opts.insert(args_str.str());
-
-    const std::string kernel_name = std::string("hog_block_normalization");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    constexpr unsigned int num_elems_read_per_iteration      = 1;
-    const unsigned int     num_rows_read_per_iteration       = num_cells_per_block.height;
-    constexpr unsigned int num_elems_written_per_iteration   = 1;
-    const unsigned int     num_rows_written_per_iteration    = num_cells_per_block.height;
-
-    // Configure kernel window
-    Window                win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        // Compute slice for the magnitude and phase tensors
-        Window slice_in = window.first_slice_window_2D();
-        slice_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
-        slice_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
-
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice_in);
-        add_2D_tensor_argument(idx, _output, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.h b/src/core/CL/kernels/CLHOGDescriptorKernel.h
deleted file mode 100644
index eee2fa3..0000000
--- a/src/core/CL/kernels/CLHOGDescriptorKernel.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H
-#define ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H
-
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/Size2D.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** OpenCL kernel to perform HOG Orientation Binning */
-class CLHOGOrientationBinningKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHOGOrientationBinningKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGOrientationBinningKernel(const CLHOGOrientationBinningKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGOrientationBinningKernel &operator=(const CLHOGOrientationBinningKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHOGOrientationBinningKernel(CLHOGOrientationBinningKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHOGOrientationBinningKernel &operator=(CLHOGOrientationBinningKernel &&) = default;
-    /** Default destructor */
-    ~CLHOGOrientationBinningKernel() = default;
-
-    /**  Initialise the kernel's inputs, output and HOG's metadata
-     *
-     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
-     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
-     * @param[out] output          Output tensor which stores the local HOG for each cell. DataType supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[in]  hog_info        HOG's metadata
-     */
-    void configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
-    /**  Initialise the kernel's inputs, output and HOG's metadata
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
-     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
-     * @param[out] output          Output tensor which stores the local HOG for each cell. DataType supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[in]  hog_info        HOG's metadata
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input_magnitude;
-    const ICLTensor *_input_phase;
-    ICLTensor       *_output;
-    Size2D           _cell_size;
-};
-
-/** OpenCL kernel to perform HOG block normalization */
-class CLHOGBlockNormalizationKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHOGBlockNormalizationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGBlockNormalizationKernel(const CLHOGBlockNormalizationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGBlockNormalizationKernel &operator=(const CLHOGBlockNormalizationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHOGBlockNormalizationKernel(CLHOGBlockNormalizationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHOGBlockNormalizationKernel &operator=(CLHOGBlockNormalizationKernel &&) = default;
-    /** Default destructor */
-    ~CLHOGBlockNormalizationKernel() = default;
-
-    /** Initialise the kernel's input, output and HOG's metadata
-     *
-     * @param[in]  input    Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[out] output   Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog_info HOG's metadata
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
-    /** Initialise the kernel's input, output and HOG's metadata
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[out] output          Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog_info        HOG's metadata
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    Size2D           _num_cells_per_block_stride;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H */
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
deleted file mode 100644
index 861155b..0000000
--- a/src/core/CL/kernels/CLHOGDetectorKernel.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-CLHOGDetectorKernel::CLHOGDetectorKernel()
-    : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr)
-{
-}
-
-void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride,
-                                    float threshold, uint16_t idx_class)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, num_detection_windows, detection_window_stride, threshold, idx_class);
-}
-
-void CLHOGDetectorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows,
-                                    const Size2D &detection_window_stride,
-                                    float threshold, uint16_t idx_class)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(hog == nullptr);
-    ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
-    ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr);
-    ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
-    ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
-
-    const Size2D &detection_window_size = hog->info()->detection_window_size();
-    const Size2D &block_size            = hog->info()->block_size();
-    const Size2D &block_stride          = hog->info()->block_stride();
-
-    _input                 = input;
-    _detection_windows     = detection_windows;
-    _num_detection_windows = num_detection_windows;
-
-    const unsigned int num_bins_per_descriptor_x   = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
-    const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
-
-    ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
-
-    std::stringstream args_str;
-    args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " ";
-    args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " ";
-    args_str << "-DTHRESHOLD=" << threshold << " ";
-    args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " ";
-    args_str << "-DIDX_CLASS=" << idx_class << " ";
-    args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " ";
-    args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " ";
-    args_str << "-DDETECTION_WINDOW_STRIDE_WIDTH=" << detection_window_stride.width << " ";
-    args_str << "-DDETECTION_WINDOW_STRIDE_HEIGHT=" << detection_window_stride.height << " ";
-
-    // Construct kernel name
-    std::set<std::string> build_opts = {};
-    build_opts.insert(args_str.str());
-
-    // Create kernel
-    const std::string kernel_name = std::string("hog_detector");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Set static kernel arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters
-    _kernel.setArg(idx++, hog->cl_buffer());
-    _kernel.setArg(idx++, detection_windows->cl_buffer());
-    _kernel.setArg(idx++, *_num_detection_windows);
-
-    // Get the number of blocks along the x and y directions of the input tensor
-    const ValidRegion &valid_region = input->info()->valid_region();
-    const size_t       num_blocks_x = valid_region.shape[0];
-    const size_t       num_blocks_y = valid_region.shape[1];
-
-    // Get the number of blocks along the x and y directions of the detection window
-    const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
-    const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
-
-    const size_t window_step_x = detection_window_stride.width / block_stride.width;
-    const size_t window_step_y = detection_window_stride.height / block_stride.height;
-
-    // Configure kernel window
-    Window win;
-    win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x) + window_step_x, window_step_x));
-    win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y) + window_step_y, window_step_y));
-
-    constexpr unsigned int num_elems_read_per_iteration = 1;
-    const unsigned int     num_rows_read_per_iteration  = num_blocks_per_descriptor_y;
-
-    update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.h b/src/core/CL/kernels/CLHOGDetectorKernel.h
deleted file mode 100644
index c28e6eb..0000000
--- a/src/core/CL/kernels/CLHOGDetectorKernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDETECTORKERNEL_H
-#define ARM_COMPUTE_CLHOGDETECTORKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform HOG detector kernel using linear SVM */
-class CLHOGDetectorKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHOGDetectorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGDetectorKernel(const CLHOGDetectorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGDetectorKernel &operator=(const CLHOGDetectorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHOGDetectorKernel(CLHOGDetectorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHOGDetectorKernel &operator=(CLHOGDetectorKernel &&) = default;
-    /** Default destructor */
-    ~CLHOGDetectorKernel() = default;
-
-    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
-     *
-     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog                     HOG data object used by @ref CLHOGOrientationBinningKernel and  @ref CLHOGBlockNormalizationKernel
-     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
-     * @param[in]  num_detection_windows   Number of detected objects
-     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
-     *                                     It must be multiple of the hog->info()->block_stride()
-     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
-     */
-    void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f,
-                   uint16_t idx_class = 0);
-    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
-     *
-     * @param[in]  compile_context         The compile context to be used.
-     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog                     HOG data object used by @ref CLHOGOrientationBinningKernel and  @ref CLHOGBlockNormalizationKernel
-     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
-     * @param[in]  num_detection_windows   Number of detected objects
-     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
-     *                                     It must be multiple of the hog->info()->block_stride()
-     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows,
-                   const Size2D &detection_window_stride, float threshold = 0.0f,
-                   uint16_t idx_class = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue);
-
-private:
-    const ICLTensor         *_input;
-    ICLDetectionWindowArray *_detection_windows;
-    cl::Buffer              *_num_detection_windows;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHOGDETECTORKERNEL_H */
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
deleted file mode 100644
index cbc056f..0000000
--- a/src/core/CL/kernels/CLHarrisCornersKernel.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLHarrisCornersKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-CLHarrisScoreKernel::CLHarrisScoreKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(), _strength_thresh(), _norm_factor(), _border_size(0)
-{
-}
-
-BorderSize CLHarrisScoreKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLHarrisScoreKernel::configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
-                                    int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
-                                    bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, block_size, norm_factor, strength_thresh, sensitivity, border_undefined);
-}
-
-void CLHarrisScoreKernel::configure(const CLCompileContext &compile_context, const ICLImage *input1, const ICLImage *input2, ICLImage *output,
-                                    int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
-                                    bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-    ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
-    ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
-
-    _input1          = input1;
-    _input2          = input2;
-    _output          = output;
-    _sensitivity     = sensitivity;
-    _strength_thresh = strength_thresh;
-    _norm_factor     = norm_factor;
-    _border_size     = BorderSize(block_size / 2);
-
-    // Select kernel
-    std::stringstream harris_score_kernel_name;
-    harris_score_kernel_name << "harris_score_" << block_size << "x" << block_size;
-
-    // Create build options
-    std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())) };
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, harris_score_kernel_name.str(), build_opts);
-
-    // Set static kernel arguments
-    unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, sensitivity);
-    _kernel.setArg(idx++, strength_thresh);
-    _kernel.setArg(idx++, norm_factor);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-    constexpr unsigned int num_elems_written_per_iteration   = 4;
-    const unsigned int     num_elems_read_per_iteration      = block_size == 7 ? 10 : 8;
-    const unsigned int     num_rows_read_per_iteration       = block_size;
-
-    Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input1_access(input1->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowRectangle  input2_access(input2->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input1_access, input2_access, output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
-    output_access.set_valid_region(win, valid_region, border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = harris_score_kernel_name.str();
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input1->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input1->info()->dimension(1));
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input2->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input2->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input2->info()->dimension(1));
-}
-
-void CLHarrisScoreKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input1, slice);
-        add_2D_tensor_argument(idx, _input2, slice);
-        add_2D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.h b/src/core/CL/kernels/CLHarrisCornersKernel.h
deleted file mode 100644
index 6482b0a..0000000
--- a/src/core/CL/kernels/CLHarrisCornersKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHARRISCORNERSKERNEL_H
-#define ARM_COMPUTE_CLHARRISCORNERSKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the harris score kernel.
- *
- * @note The implementation supports 3, 5, and 7 for the block_size.
- */
-class CLHarrisScoreKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHarrisScoreKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHarrisScoreKernel(const CLHarrisScoreKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHarrisScoreKernel &operator=(const CLHarrisScoreKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHarrisScoreKernel(CLHarrisScoreKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHarrisScoreKernel &operator=(CLHarrisScoreKernel &&) = default;
-    /** Default destructor */
-    ~CLHarrisScoreKernel() = default;
-
-    /** Setup the kernel parameters
-     *
-     * @param[in]  input1           Source image (gradient X). Data types supported S16, S32. (Must be the same as input2)
-     * @param[in]  input2           Source image (gradient Y). Data types supported S16, S32. (Must be the same as input1)
-     * @param[out] output           Destination image (harris score). Data types supported F32
-     * @param[in]  block_size       The block window size used to compute the Harris Corner score.  Supports: 3, 5 and 7
-     * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
-     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
-     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
-                   int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
-                   bool border_undefined);
-    /** Setup the kernel parameters
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input1           Source image (gradient X). Data types supported S16, S32. (Must be the same as input2)
-     * @param[in]  input2           Source image (gradient Y). Data types supported S16, S32. (Must be the same as input1)
-     * @param[out] output           Destination image (harris score). Data types supported F32
-     * @param[in]  block_size       The block window size used to compute the Harris Corner score.  Supports: 3, 5 and 7
-     * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
-     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
-     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input1, const ICLImage *input2, ICLImage *output,
-                   int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
-                   bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-protected:
-    const ICLImage *_input1;          /**< Source image - Gx component */
-    const ICLImage *_input2;          /**< Source image - Gy component */
-    ICLImage       *_output;          /**< Source image - Harris score */
-    float           _sensitivity;     /**< Sensitivity value */
-    float           _strength_thresh; /**< Threshold value */
-    float           _norm_factor;     /**< Normalization factor */
-    BorderSize      _border_size;     /**< Border size */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHARRISCORNERSKERNEL_H */
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
deleted file mode 100644
index ca5322a..0000000
--- a/src/core/CL/kernels/CLHistogramKernel.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLHistogramKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLDistribution1D.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstring>
-#include <string>
-
-using namespace arm_compute;
-
-// each thread handle 16 pixels
-constexpr signed int pixels_per_item = 16;
-
-// local work group size in X dimension
-constexpr unsigned int local_x_size = 16;
-
-CLHistogramKernel::CLHistogramKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLHistogramKernel::configure(const ICLImage *input, ICLDistribution1D *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLHistogramKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    // Check input size
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    // Check offset
-    ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range.");
-
-    // Check range
-    ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range.");
-
-    _input  = input;
-    _output = output;
-
-    if(_input->info()->dimension(0) < pixels_per_item)
-    {
-        return;
-    }
-
-    unsigned int num_bins    = _output->num_bins();
-    unsigned int window_size = _output->window();
-    unsigned int offset      = _output->offset();
-    unsigned int range       = _output->range();
-    unsigned int offrange    = offset + range;
-    unsigned int bin_size    = _output->size();
-    unsigned int buffer_size = bin_size + 1; // We need one extra place for pixels that don't meet the conditions
-
-    // Create kernel
-    bool              is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
-    const std::string kernel_name   = is_fixed_size ? "hist_local_kernel_fixed" : "hist_local_kernel";
-    _kernel                         = create_kernel(compile_context, kernel_name);
-
-    // Set static kernel arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, buffer_size, nullptr);
-    _kernel.setArg(idx++, _output->cl_buffer());
-    if(!is_fixed_size)
-    {
-        _kernel.setArg<cl_uint>(idx++, num_bins);
-        _kernel.setArg<cl_uint>(idx++, offset);
-        _kernel.setArg<cl_uint>(idx++, range);
-        _kernel.setArg<cl_uint>(idx++, offrange);
-    }
-
-    // We only run histogram on Image, therefore only 2 dimensions here
-    unsigned int end_position = (_input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
-
-    // Configure kernel window
-    Window win;
-    win.set(0, Window::Dimension(0, end_position, pixels_per_item));
-    win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
-
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, pixels_per_item));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLHistogramKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    // TODO (COMPMID-679): Add CLMemFill
-    _output->map(queue, true);
-    ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
-    memset(_output->buffer(), 0, _output->size());
-    _output->unmap(queue);
-
-    if(_input->info()->dimension(0) < pixels_per_item)
-    {
-        return;
-    }
-
-    Window             slice = window.first_slice_window_2D();
-    const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-    cl::NDRange        lws   = (local_x_size < gws_x) ? cl::NDRange(local_x_size, 1) : cl::NDRange(1, 1);
-
-    do
-    {
-        /* Run the core part which has width can be divided by 16 */
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-
-        enqueue(queue, *this, slice, lws);
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLHistogramBorderKernel::CLHistogramBorderKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLHistogramBorderKernel::configure(const ICLImage *input, ICLDistribution1D *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLHistogramBorderKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    // Check input size
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    // Check offset
-    ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range.");
-
-    // Check range
-    ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range.");
-
-    // We only run histogram on Image, therefore only 2 dimensions here
-    unsigned int start_position = (input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
-
-    if(start_position >= input->info()->dimension(0))
-    {
-        return; // no need to run histogram border kernel
-    }
-
-    _input  = input;
-    _output = output;
-
-    unsigned int num_bins    = _output->num_bins();
-    unsigned int window_size = _output->window();
-    unsigned int offset      = _output->offset();
-    unsigned int range       = _output->range();
-    unsigned int offrange    = offset + range;
-
-    // Create kernel
-    bool              is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
-    const std::string kernel_name   = is_fixed_size ? "hist_border_kernel_fixed" : "hist_border_kernel";
-    _kernel                         = create_kernel(compile_context, kernel_name);
-
-    // Set static kernel arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, _output->cl_buffer());
-    if(!is_fixed_size)
-    {
-        _kernel.setArg<cl_uint>(idx++, num_bins);
-        _kernel.setArg<cl_uint>(idx++, offset);
-        _kernel.setArg<cl_uint>(idx++, range);
-        _kernel.setArg<cl_uint>(idx++, offrange);
-    }
-
-    // Configure kernel window
-    Window win;
-    win.set(0, Window::Dimension(start_position, _input->info()->dimension(0)));
-    win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, 1));
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLHistogramBorderKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    if(window.x().start() >= window.x().end())
-    {
-        return;
-    }
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    cl::NDRange lws = cl::NDRange(1, 1);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        /* Run the border part which has width cannot be divided by 16 */
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-
-        enqueue(queue, *this, slice, lws);
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLHistogramKernel.h b/src/core/CL/kernels/CLHistogramKernel.h
deleted file mode 100644
index 9c97c65..0000000
--- a/src/core/CL/kernels/CLHistogramKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHISTOGRAMKERNEL_H
-#define ARM_COMPUTE_CLHISTOGRAMKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLDistribution1D;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface to run the histogram kernel. This kernel processes the part of image with width can be divided by 16.
- *  If the image width is not a multiple of 16, remaining pixels have to be processed with the @ref CLHistogramBorderKernel
- */
-class CLHistogramKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLHistogramKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramKernel(const CLHistogramKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramKernel &operator=(const CLHistogramKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHistogramKernel(CLHistogramKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHistogramKernel &operator=(CLHistogramKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input  Source image. Data types supported: U8.
-     * @param[out] output Destination distribution.
-     */
-    void configure(const ICLImage *input, ICLDistribution1D *output);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source image. Data types supported: U8.
-     * @param[out] output          Destination distribution.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage    *_input;
-    ICLDistribution1D *_output;
-};
-
-/** Interface to run the histogram kernel to handle the leftover part of image
- *
- */
-class CLHistogramBorderKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLHistogramBorderKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramBorderKernel(const CLHistogramBorderKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramBorderKernel &operator=(const CLHistogramBorderKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHistogramBorderKernel(CLHistogramBorderKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHistogramBorderKernel &operator=(CLHistogramBorderKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input  Source image. Data types supported: U8.
-     * @param[out] output Destination distribution.
-     */
-    void configure(const ICLImage *input, ICLDistribution1D *output);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source image. Data types supported: U8.
-     * @param[out] output          Destination distribution.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage    *_input;
-    ICLDistribution1D *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHISTOGRAMKERNEL_H*/
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
deleted file mode 100644
index 5e5683d..0000000
--- a/src/core/CL/kernels/CLIntegralImageKernel.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLIntegralImageKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-
-using namespace arm_compute;
-
-void CLIntegralImageHorKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLIntegralImageHorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    const std::string kernel_name = std::string("integral_horizontal");
-    _kernel                       = create_kernel(compile_context, kernel_name);
-
-    // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
-    const unsigned int num_elems_accessed_per_iteration  = ceil_to_multiple(num_elems_processed_per_iteration, 16);
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_accessed_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_accessed_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-CLIntegralImageVertKernel::CLIntegralImageVertKernel()
-    : _in_out(nullptr)
-{
-}
-
-void CLIntegralImageVertKernel::configure(ICLTensor *in_out)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), in_out);
-}
-
-void CLIntegralImageVertKernel::configure(const CLCompileContext &compile_context, ICLTensor *in_out)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(in_out, 1, DataType::U32);
-
-    _in_out = in_out;
-
-    // Create kernel
-    const std::string kernel_name = std::string("integral_vertical");
-    _kernel                       = create_kernel(compile_context, kernel_name);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration_x = 8;
-    const unsigned int     num_elems_processed_per_iteration_y = in_out->info()->dimension(Window::DimY);
-
-    Window win = calculate_max_window(*in_out->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowRectangle in_out_access(in_out->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
-    update_window_and_padding(win, in_out_access);
-
-    in_out_access.set_valid_region(win, in_out->info()->valid_region());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(in_out->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(in_out->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(in_out->info()->dimension(1));
-}
-
-void CLIntegralImageVertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    const size_t height = _in_out->info()->dimension(1);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _in_out, slice);
-        _kernel.setArg<cl_uint>(idx++, height);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.h b/src/core/CL/kernels/CLIntegralImageKernel.h
deleted file mode 100644
index 0e40e3a..0000000
--- a/src/core/CL/kernels/CLIntegralImageKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H
-#define ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to run the horizontal pass of the integral image kernel. */
-class CLIntegralImageHorKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  An input tensor. Data types supported: U8
-     * @param[out] output Destination tensor, Data types supported: U32.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           An input tensor. Data types supported: U8
-     * @param[out] output          Destination tensor, Data types supported: U32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-};
-
-/** Interface to run the vertical pass of the integral image kernel. */
-class CLIntegralImageVertKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLIntegralImageVertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLIntegralImageVertKernel(const CLIntegralImageVertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLIntegralImageVertKernel &operator=(const CLIntegralImageVertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLIntegralImageVertKernel(CLIntegralImageVertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLIntegralImageVertKernel &operator=(CLIntegralImageVertKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in,out] in_out The input/output tensor. Data types supported: U32
-     */
-    void configure(ICLTensor *in_out);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] in_out          The input/output tensor. Data types supported: U32
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *in_out);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor *_in_out;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H */
diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
deleted file mode 100644
index 9845dd6..0000000
--- a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLMagnitudePhaseKernel::CLMagnitudePhaseKernel()
-    : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr), _run_mag(false), _run_phase(false)
-{
-}
-
-void CLMagnitudePhaseKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
-                                       MagnitudeType mag_type, PhaseType phase_type)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), gx, gy, magnitude, phase, mag_type, phase_type);
-}
-
-void CLMagnitudePhaseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
-                                       MagnitudeType mag_type, PhaseType phase_type)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON((magnitude == nullptr) && (phase == nullptr));
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
-
-    _run_mag   = (magnitude != nullptr);
-    _run_phase = (phase != nullptr);
-    if(_run_mag)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16, DataType::S32);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, magnitude);
-    }
-    if(_run_phase)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
-    }
-
-    if(!_run_mag && !_run_phase)
-    {
-        ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
-    }
-
-    _gx        = gx;
-    _gy        = gy;
-    _magnitude = magnitude;
-    _phase     = phase;
-
-    // Construct kernel name
-    std::set<std::string> build_opts = {};
-
-    // Add magnitude type
-    if(_run_mag)
-    {
-        switch(mag_type)
-        {
-            case MagnitudeType::L1NORM:
-                build_opts.insert("-DMAGNITUDE=1");
-                break;
-            case MagnitudeType::L2NORM:
-                build_opts.insert("-DMAGNITUDE=2");
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Unsupported magnitude calculation type.");
-                build_opts.insert("-DMAGNITUDE=0");
-                break;
-        }
-    }
-
-    // Add phase type
-    if(_run_phase)
-    {
-        switch(phase_type)
-        {
-            case PhaseType::UNSIGNED:
-                build_opts.insert("-DPHASE=1");
-                break;
-            case PhaseType::SIGNED:
-                build_opts.insert("-DPHASE=2");
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Unsupported phase calculation type.");
-                build_opts.insert("-DPHASE=0");
-                break;
-        }
-    }
-
-    // Add data_type
-    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(gx->info()->data_type()));
-
-    // Create kernel
-    const std::string kernel_name = std::string("magnitude_phase");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal gx_access(gx->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal gy_access(gy->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              gx_access, gy_access,
-                              output_magnitude_access, output_phase_access);
-
-    ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
-                                                       gy->info()->valid_region());
-    output_magnitude_access.set_valid_region(win, valid_region);
-    output_phase_access.set_valid_region(win, valid_region);
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(gx->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gx->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(gx->info()->dimension(1));
-}
-
-void CLMagnitudePhaseKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _gx, slice);
-        add_2D_tensor_argument(idx, _gy, slice);
-        add_2D_tensor_argument_if((_run_mag), idx, _magnitude, slice);
-        add_2D_tensor_argument_if((_run_phase), idx, _phase, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.h b/src/core/CL/kernels/CLMagnitudePhaseKernel.h
deleted file mode 100644
index 514036b..0000000
--- a/src/core/CL/kernels/CLMagnitudePhaseKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H
-#define ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Template interface for the kernel to compute magnitude and phase.
- *
- */
-class CLMagnitudePhaseKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLMagnitudePhaseKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMagnitudePhaseKernel(const CLMagnitudePhaseKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMagnitudePhaseKernel &operator=(const CLMagnitudePhaseKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMagnitudePhaseKernel(CLMagnitudePhaseKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMagnitudePhaseKernel &operator=(CLMagnitudePhaseKernel &&) = default;
-    /** Initialise the kernel's input, output.
-     *
-     * @note At least one of output1 or output2 must be set.
-     *
-     * @param[in]  gx         The input gradient X tensor. Data types supported: S16/S32.
-     * @param[in]  gy         The input gradient Y tensor. Data types supported: S16/S32.
-     * @param[out] magnitude  (Optional) The output tensor - Magnitude. Data types supported: S16/S32.
-     * @param[out] phase      (Optional) The output tensor - Phase. Data types supported: U8.
-     * @param[in]  mag_type   (Optional) Magnitude calculation type. Default: L2NORM.
-     * @param[in]  phase_type (Optional) Phase calculation type. Default: SIGNED.
-     */
-    void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
-                   MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
-    /** Initialise the kernel's input, output.
-     *
-     * @note At least one of output1 or output2 must be set.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  gx              The input gradient X tensor. Data types supported: S16/S32.
-     * @param[in]  gy              The input gradient Y tensor. Data types supported: S16/S32.
-     * @param[out] magnitude       (Optional) The output tensor - Magnitude. Data types supported: S16/S32.
-     * @param[out] phase           (Optional) The output tensor - Phase. Data types supported: U8.
-     * @param[in]  mag_type        (Optional) Magnitude calculation type. Default: L2NORM.
-     * @param[in]  phase_type      (Optional) Phase calculation type. Default: SIGNED.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
-                   MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_gx;        /**< Input gradient X. */
-    const ICLTensor *_gy;        /**< Input gradient Y. */
-    ICLTensor       *_magnitude; /**< Output - Magnitude. */
-    ICLTensor       *_phase;     /**< Output - Phase. */
-    bool             _run_mag;   /**< Calculate magnitude ? */
-    bool             _run_phase; /**< Calculate phase ? */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H */
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
deleted file mode 100644
index aed6e6e..0000000
--- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMeanStdDevKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cmath>
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLMeanStdDevKernel::CLMeanStdDevKernel()
-    : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _border_size(0)
-{
-}
-
-BorderSize CLMeanStdDevKernel::border_size() const
-{
-    return _border_size;
-}
-
-Status CLMeanStdDevKernel::validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
-{
-    ARM_COMPUTE_UNUSED(mean);
-    ARM_COMPUTE_UNUSED(stddev);
-    ARM_COMPUTE_UNUSED(global_sum);
-    ARM_COMPUTE_UNUSED(global_sum_squared);
-    ARM_COMPUTE_RETURN_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED();
-    ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    return Status{};
-}
-
-void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, mean, global_sum, stddev, global_sum_squared);
-}
-
-void CLMeanStdDevKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, global_sum);
-    ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared);
-    ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevKernel::validate(input->info(), mean, global_sum, stddev, global_sum_squared));
-
-    _input              = input;
-    _mean               = mean;
-    _stddev             = stddev;
-    _global_sum         = global_sum;
-    _global_sum_squared = global_sum_squared;
-
-    // Create kernel
-    std::set<std::string> build_opts;
-
-    if(_stddev != nullptr)
-    {
-        build_opts.insert("-DSTDDEV");
-    }
-
-    _kernel = create_kernel(compile_context, "mean_stddev_accumulate", build_opts);
-
-    // Set fixed arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input parameters
-
-    _kernel.setArg(idx++, static_cast<cl_uint>(input->info()->dimension(1)));
-    _kernel.setArg(idx++, *_global_sum);
-
-    if(_stddev != nullptr)
-    {
-        _kernel.setArg(idx++, *_global_sum_squared);
-    }
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration_x = 8;
-    const unsigned int     num_elems_processed_per_iteration_y = input->info()->dimension(1);
-
-    _border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - input->info()->dimension(0));
-
-    Window                win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    update_window_and_padding(win, input_access);
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLMeanStdDevKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    // Clear sums
-    static const cl_ulong zero = 0;
-    queue.enqueueWriteBuffer(*_global_sum, CL_FALSE, 0, sizeof(cl_ulong), &zero);
-
-    if(_stddev != nullptr)
-    {
-        queue.enqueueWriteBuffer(*_global_sum_squared, CL_FALSE, 0, sizeof(cl_ulong), &zero);
-    }
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        // Set slice step equal to height to force gws[1] to 1,
-        // as each thread calculates the sum across all rows and columns equal to the number of elements processed by each work-item
-        slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-
-    // Calculate mean and stddev
-    cl_ulong    global_sum         = 0;
-    cl_ulong    global_sum_squared = 0;
-    const float num_pixels         = _input->info()->dimension(0) * _input->info()->dimension(1);
-
-    queue.enqueueReadBuffer(*_global_sum, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum));
-    const float mean = global_sum / num_pixels;
-    *_mean           = mean;
-
-    if(_stddev != nullptr)
-    {
-        queue.enqueueReadBuffer(*_global_sum_squared, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum_squared));
-        *_stddev = std::sqrt((global_sum_squared / num_pixels) - (mean * mean));
-    }
-}
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.h b/src/core/CL/kernels/CLMeanStdDevKernel.h
deleted file mode 100644
index 179a202..0000000
--- a/src/core/CL/kernels/CLMeanStdDevKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEANSTDDEVKERNEL_H
-#define ARM_COMPUTE_CLMEANSTDDEVKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
-class CLMeanStdDevKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLMeanStdDevKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMeanStdDevKernel(const CLMeanStdDevKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMeanStdDevKernel &operator=(const CLMeanStdDevKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMeanStdDevKernel(CLMeanStdDevKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMeanStdDevKernel &operator=(CLMeanStdDevKernel &&) = default;
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  input              Input image. Data types supported: U8.
-     * @param[out] mean               Input average pixel value.
-     * @param[out] global_sum         Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
-     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
-     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
-     */
-    void configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Input image. Data types supported: U8.
-     * @param[out] mean               Input average pixel value.
-     * @param[out] global_sum         Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
-     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
-     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevKernel.
-     *
-     * @param[in] input              Input image info. Data types supported: U8.
-     * @param[in] mean               Input average pixel value.
-     * @param[in] global_sum         Keeps global sum of pixel values.
-     * @param[in] stddev             (Optional) Output standard deviation of pixel values.
-     * @param[in] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-    BorderSize border_size() const override;
-
-private:
-    const ICLImage *_input;
-    float          *_mean;
-    float          *_stddev;
-    cl::Buffer     *_global_sum;
-    cl::Buffer     *_global_sum_squared;
-    BorderSize      _border_size;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLMEANSTDDEVKERNEL_H */
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
deleted file mode 100644
index 23a21d6..0000000
--- a/src/core/CL/kernels/CLMedian3x3Kernel.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMedian3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-BorderSize CLMedian3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLMedian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLMedian3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    const std::string kernel_name = std::string("non_linear_filter_box3x3");
-    _kernel                       = create_kernel(compile_context, kernel_name, { "-DMEDIAN" });
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.h b/src/core/CL/kernels/CLMedian3x3Kernel.h
deleted file mode 100644
index 8cc5ed7..0000000
--- a/src/core/CL/kernels/CLMedian3x3Kernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEDIAN3X3KERNEL_H
-#define ARM_COMPUTE_CLMEDIAN3X3KERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the median 3x3 filter kernel.
- *
- */
-class CLMedian3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMEDIAN3X3KERNEL_H */
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
deleted file mode 100644
index 675cfc1..0000000
--- a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMinMaxLocationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <climits>
-
-namespace arm_compute
-{
-inline int32_t FloatFlip(float val)
-{
-    static_assert(sizeof(float) == sizeof(int32_t), "Float must be same size as int32_t");
-    int32_t int_val = 0;
-
-    memcpy(&int_val, &val, sizeof(float));
-    int_val = (int_val >= 0) ? int_val : int_val ^ 0x7FFFFFFF;
-    return int_val;
-}
-
-inline float IFloatFlip(int32_t val)
-{
-    static_assert(sizeof(float) == sizeof(int32_t), "Float must be same size as int32_t");
-    float flt_val = 0.f;
-
-    val = (val >= 0) ? val : val ^ 0x7FFFFFFF;
-    memcpy(&flt_val, &val, sizeof(float));
-    return flt_val;
-}
-
-CLMinMaxKernel::CLMinMaxKernel()
-    : _input(nullptr), _min_max(), _data_type_max_min()
-{
-}
-
-void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, min_max);
-}
-
-void CLMinMaxKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(min_max == nullptr);
-
-    _input                                               = input;
-    _min_max                                             = min_max;
-    const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
-
-    switch(input->info()->data_type())
-    {
-        case DataType::U8:
-            _data_type_max_min[0] = UCHAR_MAX;
-            _data_type_max_min[1] = 0;
-            break;
-        case DataType::S16:
-            _data_type_max_min[0] = SHRT_MAX;
-            _data_type_max_min[1] = SHRT_MIN;
-            break;
-        case DataType::F32:
-            _data_type_max_min[0] = FloatFlip(std::numeric_limits<float>::max());
-            _data_type_max_min[1] = FloatFlip(std::numeric_limits<float>::lowest());
-            break;
-        default:
-            ARM_COMPUTE_ERROR("You called with the wrong image data types");
-    }
-
-    // Set kernel build options
-    std::set<std::string> build_opts{ "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) };
-
-    if(num_elems_processed_per_iteration % max_cl_vector_width != 0)
-    {
-        build_opts.emplace("-DNON_MULTIPLE_OF_16");
-    }
-
-    if(input->info()->data_type() == DataType::F32)
-    {
-        build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(std::numeric_limits<float>::max()));
-        build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(std::numeric_limits<float>::lowest()));
-        build_opts.emplace("-DIS_DATA_TYPE_FLOAT");
-    }
-    else
-    {
-        build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(_data_type_max_min[0]));
-        build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(_data_type_max_min[1]));
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "minmax", build_opts);
-
-    // Set fixed arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, *_min_max);
-    _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(input->info()->dimension(0)));
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, ceil_to_multiple(num_elems_processed_per_iteration, 16)));
-    ICLKernel::configure_internal(win);
-}
-
-void CLMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    // Reset mininum and maximum values
-    queue.enqueueWriteBuffer(*_min_max, CL_FALSE /* blocking */, 0, _data_type_max_min.size() * sizeof(int), _data_type_max_min.data());
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-
-    cl_int min = 0;
-    cl_int max = 0;
-    queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 0 * sizeof(cl_int), sizeof(cl_int), static_cast<int *>(&min));
-    queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 1 * sizeof(cl_int), sizeof(cl_int), static_cast<int *>(&max));
-
-    if(_input->info()->data_type() == DataType::F32)
-    {
-        std::array<float, 2> min_max =
-        {
-            {
-                IFloatFlip(min),
-                IFloatFlip(max)
-            }
-        };
-        queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(float), min_max.data());
-    }
-    else
-    {
-        std::array<int32_t, 2> min_max = { { min, max } };
-        queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(int32_t), min_max.data());
-    }
-}
-
-CLMinMaxLocationKernel::CLMinMaxLocationKernel()
-    : _input(nullptr), _min_max_count(nullptr)
-{
-}
-
-void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, min_max, min_max_count, min_loc, max_loc);
-}
-
-void CLMinMaxLocationKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc,
-                                       ICLCoordinates2DArray *max_loc)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(min_max == nullptr);
-    ARM_COMPUTE_ERROR_ON(min_max_count == nullptr && min_loc == nullptr && max_loc == nullptr);
-
-    _input         = input;
-    _min_max_count = min_max_count;
-
-    // Set kernel build options
-    std::set<std::string> build_opts;
-    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : "");
-    build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : "");
-    build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : "");
-    if(input->info()->data_type() == DataType::F32)
-    {
-        build_opts.emplace("-DIS_DATA_TYPE_FLOAT");
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "minmaxloc", build_opts);
-
-    // Set static arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, *min_max);
-    _kernel.setArg(idx++, *min_max_count);
-    if(min_loc != nullptr)
-    {
-        _kernel.setArg(idx++, min_loc->cl_buffer());
-        _kernel.setArg<cl_uint>(idx++, min_loc->max_num_values());
-    }
-    if(max_loc != nullptr)
-    {
-        _kernel.setArg(idx++, max_loc->cl_buffer());
-        _kernel.setArg<cl_uint>(idx++, max_loc->max_num_values());
-    }
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    Window                 win                               = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
-    ICLKernel::configure_internal(win);
-}
-
-void CLMinMaxLocationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    static const unsigned int zero_count = 0;
-    queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 0 * sizeof(zero_count), sizeof(zero_count), &zero_count);
-    queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 1 * sizeof(zero_count), sizeof(zero_count), &zero_count);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.h b/src/core/CL/kernels/CLMinMaxLocationKernel.h
deleted file mode 100644
index 2196abe..0000000
--- a/src/core/CL/kernels/CLMinMaxLocationKernel.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H
-#define ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "src/core/CL/ICLKernel.h"
-
-#include <array>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the kernel to perform min max search on an image.
- */
-class CLMinMaxKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLMinMaxKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxKernel(const CLMinMaxKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxKernel &operator=(const CLMinMaxKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMinMaxKernel(CLMinMaxKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMinMaxKernel &operator=(CLMinMaxKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input   Input Image. Data types supported: U8/S16/F32.
-     * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     */
-    void configure(const ICLImage *input, cl::Buffer *min_max);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input Image. Data types supported: U8/S16/F32.
-     * @param[out] min_max         Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;               /**< Input image. */
-    cl::Buffer      *_min_max;             /**< Minimum/maximum value. */
-    std::array<int, 2> _data_type_max_min; /**< Maximum and minimum data type value respectively. */
-};
-
-/** Interface for the kernel to find min max locations of an image.
- */
-class CLMinMaxLocationKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLMinMaxLocationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLocationKernel(const CLMinMaxLocationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLocationKernel &operator=(const CLMinMaxLocationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLocationKernel(CLMinMaxLocationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLocationKernel &operator=(CLMinMaxLocationKernel &&) = default;
-    /** Initialise the kernel's input and outputs.
-     *
-     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
-     *
-     * @param[in]  input         Input image. Data types supported: U8/S16/F32.
-     * @param[out] min_max       Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] min_max_count Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
-     * @param[out] min_loc       (Optional) Array of Coordinates2D used to store minimum value locations.
-     * @param[out] max_loc       (Optional) Array of Coordinates2D used to store maximum value locations.
-     */
-    void configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
-                   ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
-    /** Initialise the kernel's input and outputs.
-     *
-     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input image. Data types supported: U8/S16/F32.
-     * @param[out] min_max         Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] min_max_count   Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
-     * @param[out] min_loc         (Optional) Array of Coordinates2D used to store minimum value locations.
-     * @param[out] max_loc         (Optional) Array of Coordinates2D used to store maximum value locations.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
-                   ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage *_input;         /**< Input image. */
-    cl::Buffer     *_min_max_count; /**< Minimum/maximum value occurrences. */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
deleted file mode 100644
index c73acaf..0000000
--- a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLNonLinearFilterKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-CLNonLinearFilterKernel::CLNonLinearFilterKernel()
-    : _border_size(0)
-{
-}
-
-BorderSize CLNonLinearFilterKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLNonLinearFilterKernel::configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
-                                        unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                                        bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_undefined);
-}
-
-void CLNonLinearFilterKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
-                                        unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                                        bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(mask_size != 3 && mask_size != 5);
-    ARM_COMPUTE_ERROR_ON_MSG(pattern == MatrixPattern::OTHER, "MatrixPattern::OTHER is not supported!");
-    ARM_COMPUTE_UNUSED(mask);
-
-    _input       = input;
-    _output      = output;
-    _border_size = BorderSize(mask_size / 2);
-
-    // Define build options
-    std::set<std::string> build_opts;
-    build_opts.emplace("-D" + string_from_non_linear_filter_function(function));
-
-    // Define kernel
-    std::string pattern_name = string_from_matrix_pattern(pattern);
-    std::transform(pattern_name.begin(), pattern_name.end(), pattern_name.begin(), ::tolower);
-    std::stringstream ss;
-    ss << "non_linear_filter_" << pattern_name << mask_size << "x" << mask_size;
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, ss.str(), build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    const unsigned int     num_rows_read_per_iteration       = mask_size;
-
-    Window win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.h b/src/core/CL/kernels/CLNonLinearFilterKernel.h
deleted file mode 100644
index ed42063..0000000
--- a/src/core/CL/kernels/CLNonLinearFilterKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H
-#define ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to apply a non-linear filter */
-class CLNonLinearFilterKernel : public ICLSimple2DKernel
-{
-public:
-    /** Default constructor */
-    CLNonLinearFilterKernel();
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8
-     * @param[out] output           Destination tensor. Data types supported: U8
-     * @param[in]  function         Non linear function to perform
-     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
-     * @param[in]  pattern          Mask pattern
-     * @param[in]  mask             The given mask. Will be used only if pattern is specified to PATTERN_OTHER
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
-                   unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                   bool border_undefined);
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8
-     * @param[out] output           Destination tensor. Data types supported: U8
-     * @param[in]  function         Non linear function to perform
-     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
-     * @param[in]  pattern          Mask pattern
-     * @param[in]  mask             The given mask. Will be used only if pattern is specified to PATTERN_OTHER
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
-                   unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                   bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-
-private:
-    BorderSize _border_size; /**< Border size */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H */
diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
deleted file mode 100644
index 7d5c5ba..0000000
--- a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-BorderSize CLNonMaximaSuppression3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLNonMaximaSuppression3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLNonMaximaSuppression3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
-
-    _input  = input;
-    _output = output;
-
-    // Create kernel
-    std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
-    _kernel                          = create_kernel(compile_context, "non_max_suppression", build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
deleted file mode 100644
index d9ed60c..0000000
--- a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H
-#define ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to perform Non-Maxima suppression over a 3x3 window using OpenCL
- *
- * @note Used by @ref CLFastCorners and @ref CLHarrisCorners
- */
-class CLNonMaximaSuppression3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor)
-     * @param[out] output           Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor)
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor)
-     * @param[out] output           Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor)
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H */
diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
deleted file mode 100644
index 7ceddc9..0000000
--- a/src/core/CL/kernels/CLScharr3x3Kernel.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLScharr3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLScharr3x3Kernel::CLScharr3x3Kernel()
-    : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
-{
-}
-
-BorderSize CLScharr3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLScharr3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLScharr3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_scharr_x = output_x != nullptr;
-    _run_scharr_y = output_y != nullptr;
-
-    if(_run_scharr_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_scharr_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input    = input;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_scharr_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_scharr_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "scharr3x3", build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-}
-
-void CLScharr3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument_if((_run_scharr_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_scharr_y), idx, _output_y, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.h b/src/core/CL/kernels/CLScharr3x3Kernel.h
deleted file mode 100644
index a670da5..0000000
--- a/src/core/CL/kernels/CLScharr3x3Kernel.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSCHARR3X3KERNEL_H
-#define ARM_COMPUTE_CLSCHARR3X3KERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 Scharr filter on a tensor.
- *
- * @f[
- *      \mathbf{G}_x=\begin{vmatrix}
- *      -3 & 0 & +3\\
- *      -10& 0 & +10\\
- *      -3 & 0 & +3
- *      \end{vmatrix}
- * @f]
- * @f[
- *      \mathbf{G}_y=\begin{vmatrix}
- *      -3 & -10 & -3\\
- *       0 & 0 & 0\\
- *      +3 & +10 & +3
- *      \end{vmatrix}
- * @f]
- */
-class CLScharr3x3Kernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLScharr3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLScharr3x3Kernel(const CLScharr3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLScharr3x3Kernel &operator=(const CLScharr3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLScharr3x3Kernel(CLScharr3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLScharr3x3Kernel &operator=(CLScharr3x3Kernel &&) = default;
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    bool             _run_scharr_x; /**< Do we need to run Scharr X ? */
-    bool             _run_scharr_y; /**< Do we need to run Scharr Y ? */
-    const ICLTensor *_input;        /**< Input image */
-    ICLTensor       *_output_x;     /**< Output image for scharr X */
-    ICLTensor       *_output_y;     /**< Output image for scharr Y */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSCHARR3X3KERNEL_H */
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
deleted file mode 100644
index a87677a..0000000
--- a/src/core/CL/kernels/CLSobel3x3Kernel.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLSobel3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLSobel3x3Kernel::CLSobel3x3Kernel()
-    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
-{
-}
-
-BorderSize CLSobel3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLSobel3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLSobel3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input    = input;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_sobel_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_sobel_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("sobel3x3");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.h b/src/core/CL/kernels/CLSobel3x3Kernel.h
deleted file mode 100644
index fed8068..0000000
--- a/src/core/CL/kernels/CLSobel3x3Kernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL3X3KERNEL_H
-#define ARM_COMPUTE_CLSOBEL3X3KERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 Sobel filter on a tensor. */
-class CLSobel3x3Kernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel3x3Kernel(const CLSobel3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel3x3Kernel &operator=(const CLSobel3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel3x3Kernel(CLSobel3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel3x3Kernel &operator=(CLSobel3x3Kernel &&) = default;
-    /** Default destructor */
-    ~CLSobel3x3Kernel() = default;
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;       /**< Input tensor */
-    ICLTensor       *_output_x;    /**< Output tensor for Sobel X */
-    ICLTensor       *_output_y;    /**< Output tensor for Sobel Y */
-    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
-    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL3X3KERNEL_H */
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
deleted file mode 100644
index c450bec..0000000
--- a/src/core/CL/kernels/CLSobel5x5Kernel.cpp
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLSobel5x5HorKernel::CLSobel5x5HorKernel()
-    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
-{
-}
-
-BorderSize CLSobel5x5HorKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLSobel5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLSobel5x5HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input       = input;
-    _output_x    = output_x;
-    _output_y    = output_y;
-    _border_size = BorderSize(border_undefined ? 0 : 2, 2);
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_sobel_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_sobel_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("sobel_separable1x5");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel5x5HorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLSobel5x5VertKernel::CLSobel5x5VertKernel()
-    : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
-{
-}
-
-BorderSize CLSobel5x5VertKernel::border_size() const
-{
-    return BorderSize{ 2, 0 };
-}
-
-void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input_x, input_y, output_x, output_y, border_undefined);
-}
-
-void CLSobel5x5VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S16);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S16);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input_x  = input_x;
-    _input_y  = input_y;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_sobel_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_sobel_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("sobel_separable5x1");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 5;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowRectangle  input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel5x5VertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _input_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _input_y, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
-        _kernel.setArg(idx++, 0 /*dummy*/);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.h b/src/core/CL/kernels/CLSobel5x5Kernel.h
deleted file mode 100644
index a163ac9..0000000
--- a/src/core/CL/kernels/CLSobel5x5Kernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL5X5KERNEL_H
-#define ARM_COMPUTE_CLSOBEL5X5KERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 5x5 Sobel filter on a tensor. */
-class CLSobel5x5HorKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel5x5HorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5HorKernel(const CLSobel5x5HorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5HorKernel &operator=(const CLSobel5x5HorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5HorKernel(CLSobel5x5HorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5HorKernel &operator=(CLSobel5x5HorKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel5x5HorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;       /**< Input tensor */
-    ICLTensor       *_output_x;    /**< X output of horizontal pass */
-    ICLTensor       *_output_y;    /**< Y output of horizontal pass */
-    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
-    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
-    BorderSize       _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 5x5 Sobel filter on a tensor. */
-class CLSobel5x5VertKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel5x5VertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5VertKernel(const CLSobel5x5VertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5VertKernel &operator=(const CLSobel5x5VertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5VertKernel(CLSobel5x5VertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5VertKernel &operator=(CLSobel5x5VertKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel5x5VertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S16.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S16.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input_x;     /**< X input (X output of the horizontal pass) */
-    const ICLTensor *_input_y;     /**< Y input (Y output of the horizontal pass) */
-    ICLTensor       *_output_x;    /**< X output of sobel */
-    ICLTensor       *_output_y;    /**< Y output of sobel */
-    bool             _run_sobel_x; /**< Do we need to run sobel X? */
-    bool             _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL5X5KERNEL_H */
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
deleted file mode 100644
index 1cfa74f..0000000
--- a/src/core/CL/kernels/CLSobel7x7Kernel.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLSobel7x7HorKernel::CLSobel7x7HorKernel()
-    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
-{
-}
-
-BorderSize CLSobel7x7HorKernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLSobel7x7HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLSobel7x7HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
-    }
-
-    _input       = input;
-    _output_x    = output_x;
-    _output_y    = output_y;
-    _border_size = BorderSize(border_undefined ? 0 : 3, 3);
-
-    // Construct kernel name
-    const std::string kernel_name = "sobel_separable1x7";
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_sobel_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_sobel_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel7x7HorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-    do
-    {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
-
-CLSobel7x7VertKernel::CLSobel7x7VertKernel()
-    : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
-{
-}
-
-BorderSize CLSobel7x7VertKernel::border_size() const
-{
-    return BorderSize{ 3, 0 };
-}
-
-void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input_x, input_y, output_x, output_y, border_undefined);
-}
-
-void CLSobel7x7VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S32);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S32);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
-    }
-
-    _input_x  = input_x;
-    _input_y  = input_y;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Set build options
-    std::set<std::string> build_opts;
-
-    if(_run_sobel_x)
-    {
-        build_opts.insert("-DGRAD_X");
-    }
-
-    if(_run_sobel_y)
-    {
-        build_opts.insert("-DGRAD_Y");
-    }
-
-    // Create kernel
-    const std::string kernel_name = std::string("sobel_separable7x1");
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts);
-
-    const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 7;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowRectangle  input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowRectangle  input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel7x7VertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_2D();
-
-    do
-    {
-        unsigned int idx = 0;
-
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _input_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _input_y, slice);
-        add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
-        _kernel.setArg(idx++, 0 /*dummy*/);
-
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.h b/src/core/CL/kernels/CLSobel7x7Kernel.h
deleted file mode 100644
index c85f0ae..0000000
--- a/src/core/CL/kernels/CLSobel7x7Kernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL7X7KERNEL_H
-#define ARM_COMPUTE_CLSOBEL7X7KERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 7x7 Sobel filter on a tensor. */
-class CLSobel7x7HorKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel7x7HorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7HorKernel(const CLSobel7x7HorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7HorKernel &operator=(const CLSobel7x7HorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7HorKernel(CLSobel7x7HorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7HorKernel &operator=(CLSobel7x7HorKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel7x7HorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;       /**< Input tensor */
-    ICLTensor       *_output_x;    /**< X output of horizontal pass */
-    ICLTensor       *_output_y;    /**< Y output of horizontal pass */
-    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
-    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
-    BorderSize       _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 7x7 Sobel filter on a tensor. */
-class CLSobel7x7VertKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel7x7VertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7VertKernel(const CLSobel7x7VertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7VertKernel &operator=(const CLSobel7x7VertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7VertKernel(CLSobel7x7VertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7VertKernel &operator=(CLSobel7x7VertKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel7x7VertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S32.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S32.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input_x;     /**< X input (X output of the horizontal pass) */
-    const ICLTensor *_input_y;     /**< Y input (Y output of the horizontal pass) */
-    ICLTensor       *_output_x;    /**< X output of sobel */
-    ICLTensor       *_output_y;    /**< Y output of sobel */
-    bool             _run_sobel_x; /**< Do we need to run sobel X? */
-    bool             _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL7X7KERNEL_H */
diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp
deleted file mode 100644
index b82f4c9..0000000
--- a/src/core/CL/kernels/CLTableLookupKernel.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLTableLookupKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLLut.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-#include <cstdint>
-#include <string>
-
-using namespace arm_compute;
-
-void CLTableLookupKernel::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, lut, output);
-}
-
-void CLTableLookupKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(lut == nullptr);
-    ARM_COMPUTE_ERROR_ON(DataType::U8 != lut->type() && DataType::S16 != lut->type());
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    // Create kernel
-    std::string kernel_name = (DataType::S16 == lut->type()) ? "tablelookup_S16" : "tablelookup_U8";
-    _kernel                 = create_kernel(compile_context, kernel_name);
-
-    // Set lut argument
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, lut->cl_buffer());
-    if(DataType::S16 == lut->type())
-    {
-        _kernel.setArg(idx++, lut->index_offset());
-        _kernel.setArg(idx++, static_cast<uint32_t>(lut->num_elements()));
-    }
-
-    // Configure kernel
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
-}
diff --git a/src/core/CL/kernels/CLTableLookupKernel.h b/src/core/CL/kernels/CLTableLookupKernel.h
deleted file mode 100644
index c8d15cb..0000000
--- a/src/core/CL/kernels/CLTableLookupKernel.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTABLELOOKUPKERNEL_H
-#define ARM_COMPUTE_CLTABLELOOKUPKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-class ICLLut;
-
-/** Interface for the kernel to perform table lookup calculations. */
-class CLTableLookupKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input, lut and output.
-     *
-     * @param[in]  input  An input tensor. Data types supported: U8, S16.
-     * @param[in]  lut    The input LUT. Data types supported: U8, S16.
-     * @param[out] output The output tensor. Data types supported: U8, S16.
-     */
-    void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
-    /** Initialise the kernel's input, lut and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           An input tensor. Data types supported: U8, S16.
-     * @param[in]  lut             The input LUT. Data types supported: U8, S16.
-     * @param[out] output          The output tensor. Data types supported: U8, S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLTABLELOOKUPKERNEL_H */
diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp
deleted file mode 100644
index 72c22f0..0000000
--- a/src/core/CL/kernels/CLThresholdKernel.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLThresholdKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <string>
-
-namespace arm_compute
-{
-void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
-}
-
-void CLThresholdKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    // Construct kernel name
-    std::string kernel_name = "threshold";
-
-    switch(info.type)
-    {
-        case ThresholdType::BINARY:
-            kernel_name += "_binary";
-            break;
-        case ThresholdType::RANGE:
-            kernel_name += "_range";
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Thresholding type not recognized");
-            break;
-    }
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, kernel_name);
-
-    // Set arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, info.false_value);
-    _kernel.setArg(idx++, info.true_value);
-    _kernel.setArg(idx++, info.threshold);
-
-    if(ThresholdType::RANGE == info.type)
-    {
-        _kernel.setArg(idx++, info.upper);
-    }
-
-    // Make sure _kernel is initialized before calling the parent's configure
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLThresholdKernel.h b/src/core/CL/kernels/CLThresholdKernel.h
deleted file mode 100644
index 511eaed..0000000
--- a/src/core/CL/kernels/CLThresholdKernel.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTHRESHOLDKERNEL_H
-#define ARM_COMPUTE_CLTHRESHOLDKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the thresholding kernel. */
-class CLThresholdKernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input, output and threshold parameters.
-     *
-     * @param[in]  input  An input tensor. Data types supported: U8
-     * @param[out] output The output tensor. Data types supported: U8.
-     * @param[in]  info   Threshold descriptor
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
-    /**Initialise the kernel's input, output and threshold parameters.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           An input tensor. Data types supported: U8
-     * @param[out] output          The output tensor. Data types supported: U8.
-     * @param[in]  info            Threshold descriptor
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NETHRESHOLDKERNEL_H */
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
deleted file mode 100644
index 600c67a..0000000
--- a/src/core/CL/kernels/CLWarpAffineKernel.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWarpAffineKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <set>
-#include <sstream>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-void options_add_matrix(std::set<std::string> &options, const std::array<float, 9> &matrix)
-{
-    for(size_t i = 0; i < 6; ++i)
-    {
-        std::stringstream mat_str;
-        mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
-        options.insert(mat_str.str());
-    }
-}
-} // namespace
-
-BorderSize CLWarpAffineKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLWarpAffineKernel::configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy);
-}
-
-void CLWarpAffineKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
-
-    _input  = input;
-    _output = output;
-
-    // Create build options
-    std::set<std::string> options;
-    options_add_matrix(options, matrix);
-    options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-
-    // Create kernel
-    std::string interpolation_name = string_from_interpolation_policy(policy);
-    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
-    const std::string kernel_name = "warp_affine_" + interpolation_name;
-    _kernel                       = create_kernel(compile_context, kernel_name, options);
-
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
-    _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    int       total_right  = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
-    const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
-
-    AccessWindowStatic     input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(3));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(3));
-    _config_id += "_";
-    _config_id += lower_string(string_from_interpolation_policy(policy));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.h b/src/core/CL/kernels/CLWarpAffineKernel.h
deleted file mode 100644
index c600ee7..0000000
--- a/src/core/CL/kernels/CLWarpAffineKernel.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPAFFINEKERNEL_H
-#define ARM_COMPUTE_CLWARPAFFINEKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the warp affine kernel.*/
-class CLWarpAffineKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor, Data types supported: U8.
-     * @param[in]  matrix The perspective matrix. Must be 2x3 of type float
-     *                    The matrix argument requires 9 values, the last 3 values are ignored.
-     * @param[in]  policy The interpolation type.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor, Data types supported: U8.
-     * @param[in]  matrix          The perspective matrix. Must be 2x3 of type float
-     *                             The matrix argument requires 9 values, the last 3 values are ignored.
-     * @param[in]  policy          The interpolation type.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWARPAFFINEKERNEL_H */
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
deleted file mode 100644
index 5f20a0b..0000000
--- a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cstddef>
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-namespace
-{
-inline void options_add_matrix(std::set<std::string> &options, const std::array<float, 9> &matrix)
-{
-    for(size_t i = 0; i < 9; ++i)
-    {
-        std::stringstream mat_str;
-        mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
-        options.insert(mat_str.str());
-    }
-}
-} // namespace
-
-BorderSize CLWarpPerspectiveKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void CLWarpPerspectiveKernel::configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy);
-}
-
-void CLWarpPerspectiveKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
-
-    _input  = input;
-    _output = output;
-
-    // Create build options
-    std::set<std::string> options;
-    options_add_matrix(options, matrix);
-    options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-
-    // Create kernel
-    std::string interpolation_name = string_from_interpolation_policy(policy);
-    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
-    std::string kernel_name = "warp_perspective_" + interpolation_name;
-    _kernel                 = create_kernel(compile_context, kernel_name, options);
-
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
-    _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowStatic     input_access(input->info(), -border_size().left, -border_size().top, input->info()->dimension(0) + border_size().right, input->info()->dimension(1) + border_size().bottom);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.h b/src/core/CL/kernels/CLWarpPerspectiveKernel.h
deleted file mode 100644
index dcbe1c5..0000000
--- a/src/core/CL/kernels/CLWarpPerspectiveKernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H
-#define ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-/** Interface for the warp perspective kernel.*/
-class CLWarpPerspectiveKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor, Data types supported: U8.
-     * @param[in]  matrix The perspective matrix. Must be 3x3 of type float.
-     * @param[in]  policy The interpolation type.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor, Data types supported: U8.
-     * @param[in]  matrix          The perspective matrix. Must be 3x3 of type float.
-     * @param[in]  policy          The interpolation type.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H */
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index aea245c..b2c5592 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -35,7 +35,6 @@
 #include "src/core/NEON/kernels/NECol2ImKernel.h"
 #include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
 #include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEConvolutionKernel.h"
 #include "src/core/NEON/kernels/NECropKernel.h"
 #include "src/core/NEON/kernels/NECumulativeDistributionKernel.h"
 #include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
@@ -72,7 +71,6 @@
 #include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
 #include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
 #include "src/core/NEON/kernels/NEMinMaxLayerKernel.h"
-#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
 #include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
 #include "src/core/NEON/kernels/NEPadLayerKernel.h"
 #include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
@@ -83,6 +81,7 @@
 #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 #include "src/core/NEON/kernels/NERangeKernel.h"
 #include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/NEON/kernels/NERemapKernel.h"
 #include "src/core/NEON/kernels/NEReorgLayerKernel.h"
 #include "src/core/NEON/kernels/NEReverseKernel.h"
 #include "src/core/NEON/kernels/NEScaleKernel.h"
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
deleted file mode 100644
index 075de41..0000000
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ /dev/null
@@ -1,1625 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEConvolutionKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <array>
-#include <cstdint>
-#include <cstring>
-#include <tuple>
-
-namespace arm_compute
-{
-namespace
-{
-const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
-
-inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output)
-{
-    const int16x8_t s16results = vcombine_s16(vqmovn_s32(out),
-                                              vqmovn_s32(out2));
-    vst1q_s16(output, s16results);
-}
-
-inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output)
-{
-    const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovun_s32(out),
-                                                        vqmovun_s32(out2)));
-    vst1_u8(output, u8results);
-}
-
-inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output)
-{
-    const uint16x8_t u16results = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2));
-    const int16x8_t  s16results = vreinterpretq_s16_u16(vminq_u16(u16results, max_int16));
-    vst1q_s16(output, s16results);
-}
-
-inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output)
-{
-    const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovn_u32(out),
-                                                        vqmovn_u32(out2)));
-    vst1_u8(output, u8results);
-}
-
-inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output)
-{
-    vst1q_s16(output, out);
-    vst1q_s16(output + 8, out2);
-}
-
-inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output)
-{
-    const uint8x16_t u8results = vcombine_u8(vqmovun_s16(out),
-                                             vqmovun_s16(out2));
-    vst1q_u8(output, u8results);
-}
-
-inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output)
-{
-    const uint8x16_t u8results = vcombine_u8(vqmovn_u16(out),
-                                             vqmovn_u16(out2));
-    vst1q_u8(output, u8results);
-}
-
-inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output)
-{
-    vst1q_s16(output, vreinterpretq_s16_u16(vminq_u16(out, max_int16)));
-    vst1q_s16(output + 8, vreinterpretq_s16_u16(vminq_u16(out2, max_int16)));
-}
-
-inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2)
-{
-    // Convert to s16 and split in blocks of 4 values:
-    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
-    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
-
-    const int16x4x3_t row =
-    {
-        {
-            vget_low_s16(s16_tmp0),
-            vget_high_s16(s16_tmp0),
-            vget_low_s16(s16_tmp1)
-        }
-    };
-
-    // Calculate row left value for pixels [0,3]
-    out = vmlal_s16(out, row.val[0], mat0);
-    // Calculate row middle value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
-    // Calculate row right value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
-
-    // Calculate row left value for pixels [4,7]
-    out2 = vmlal_s16(out2, row.val[1], mat0);
-    // Calculate row middle value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
-    // Calculate row right value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
-}
-
-inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
-{
-    const int16x4_t mat0 = vld1_dup_s16(convolution);
-    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
-    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
-
-    convolve_row3x1_unrolled(out, out2, row_data, mat0, mat1, mat2);
-}
-
-inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
-{
-    const int16x4_t mat0 = vld1_dup_s16(convolution);
-    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
-    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
-    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
-    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
-
-    // Convert to s16 and split in blocks of 4 values:
-    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
-    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
-
-    const int16x4x3_t row =
-    {
-        {
-            vget_low_s16(s16_tmp0),
-            vget_high_s16(s16_tmp0),
-            vget_low_s16(s16_tmp1)
-        }
-    };
-
-    // Calculate row left 2 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[0], mat0);
-    // Calculate row left 1 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
-    // Calculate row middle value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
-    // Calculate row right +1 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
-    // Calculate row right +2 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[1], mat4);
-
-    // Calculate row left 2 value for pixels [4,7]
-    out2 = vmlal_s16(out2, row.val[1], mat0);
-    // Calculate row left 1 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
-    // Calculate row middle value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
-    // Calculate row right +1 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
-    // Calculate row right +2 value for pixels [4,7]
-    out2 = vmlal_s16(out2, row.val[2], mat4);
-}
-
-inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
-{
-    const int16x4_t mat0 = vld1_dup_s16(convolution);
-    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
-    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
-    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
-    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
-    const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
-    const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
-
-    // Convert to s16 and split in blocks of 4 values:
-    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
-    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
-
-    const int16x4x4_t row =
-    {
-        {
-            vget_low_s16(s16_tmp0),
-            vget_high_s16(s16_tmp0),
-            vget_low_s16(s16_tmp1),
-            vget_high_s16(s16_tmp1)
-        }
-    };
-
-    // Calculate row left 3 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[0], mat0);
-    // Calculate row left 2 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
-    // Calculate row left 1 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
-    // Calculate row middle value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
-    // Calculate row right +1 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[1], mat4);
-    // Calculate row right +2 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
-    // Calculate row right +3 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
-
-    // Calculate row left 3 value for pixels [4,7]
-    out2 = vmlal_s16(out2, row.val[1], mat0);
-    // Calculate row left 2 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
-    // Calculate row left 1 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
-    // Calculate row middle value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
-    // Calculate row right +1 value for pixels [4,7]
-    out2 = vmlal_s16(out2, row.val[2], mat4);
-    // Calculate row right +2 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
-    // Calculate row right +3 value for pixels [4,7]
-    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
-}
-
-inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
-{
-    const int16x4_t mat0 = vld1_dup_s16(convolution);
-    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
-    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
-    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
-    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
-    const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
-    const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
-    const int16x4_t mat7 = vld1_dup_s16(convolution + 7);
-    const int16x4_t mat8 = vld1_dup_s16(convolution + 8);
-
-    // Convert to s16 and split in blocks of 4 values:
-    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
-    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
-
-    const int16x4x4_t row =
-    {
-        {
-            vget_low_s16(s16_tmp0),
-            vget_high_s16(s16_tmp0),
-            vget_low_s16(s16_tmp1),
-            vget_high_s16(s16_tmp1)
-        }
-    };
-
-    // Calculate row left 4 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[0], mat0);
-    // Calculate row left 3 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
-    // Calculate row left 2 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
-    // Calculate row left 1 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
-    // Calculate row middle value for pixels [0,3]
-    out = vmlal_s16(out, row.val[1], mat4);
-    // Calculate row right +1 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
-    // Calculate row right +2 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
-    // Calculate row right +3 value for pixels [0,3]
-    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7);
-    // Calculate row right +4 value for pixels [0,3]
-    out = vmlal_s16(out, row.val[2], mat8);
-
-    // Calculate row left 4 value for pixels [0,3]
-    out2 = vmlal_s16(out2, row.val[1], mat0);
-    // Calculate row left 3 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
-    // Calculate row left 2 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
-    // Calculate row left 1 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
-    // Calculate row middle value for pixels [0,3]
-    out2 = vmlal_s16(out2, row.val[2], mat4);
-    // Calculate row right +1 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
-    // Calculate row right +2 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
-    // Calculate row right +3 value for pixels [0,3]
-    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7);
-    // Calculate row right +4 value for pixels [0,3]
-    out2 = vmlal_s16(out2, row.val[3], mat8);
-}
-} // namespace
-
-/****************************************************************************************\
- *                                    Square Convolution                                *
-\****************************************************************************************/
-
-template <unsigned int matrix_size>
-NEConvolutionKernel<matrix_size>::NEConvolutionKernel()
-    : INESimpleKernel(), _scale(0), _convolution{ {} }
-{
-}
-
-template <unsigned int matrix_size>
-BorderSize             NEConvolutionKernel<matrix_size>::border_size() const
-{
-    return BorderSize{ matrix_size / 2 };
-}
-
-template <unsigned int matrix_size>
-void NEConvolutionKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
-
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-
-    _input  = input;
-    _output = output;
-
-    std::copy_n(conv, _convolution.size(), _convolution.begin());
-
-    if(scale == 0)
-    {
-        _scale = calculate_matrix_scale(_convolution.data(), matrix_size);
-    }
-    else
-    {
-        _scale = scale;
-    }
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-template <>
-template <typename OutputType>
-void NEConvolutionKernel<3>::convolution(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    // Load the matrix's coefficients into Neon registers:
-    const int16x4_t   mat00     = vld1_dup_s16(_convolution.data());
-    const int16x4_t   mat01     = vld1_dup_s16(_convolution.data() + 1);
-    const int16x4_t   mat02     = vld1_dup_s16(_convolution.data() + 2);
-    const int16x4_t   mat10     = vld1_dup_s16(_convolution.data() + 3);
-    const int16x4_t   mat11     = vld1_dup_s16(_convolution.data() + 4);
-    const int16x4_t   mat12     = vld1_dup_s16(_convolution.data() + 5);
-    const int16x4_t   mat20     = vld1_dup_s16(_convolution.data() + 6);
-    const int16x4_t   mat21     = vld1_dup_s16(_convolution.data() + 7);
-    const int16x4_t   mat22     = vld1_dup_s16(_convolution.data() + 8);
-    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
-
-    const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1));
-    const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
-    const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4_t out  = vdupq_n_s32(0);
-        int32x4_t out2 = vdupq_n_s32(0);
-
-        // Load 16 bytes from the top row:
-        const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-        convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02);
-
-        // Load 16 bytes from the middle row:
-        const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-        convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12);
-
-        // Load 16 bytes from the middle row:
-        const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset());
-        convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22);
-
-        // Apply scale
-        if(_scale != 1)
-        {
-            // Convert to F32, scale and convert back to S32
-            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
-            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
-        }
-
-        // Clamp and store as U8 or S16:
-        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
-    },
-    input, output);
-}
-
-template <>
-template <typename OutputType>
-void NEConvolutionKernel<5>::convolution(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
-
-    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2));
-    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1));
-    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0));
-    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
-    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4_t out  = vdupq_n_s32(0);
-        int32x4_t out2 = vdupq_n_s32(0);
-
-        // Load 16 bytes from the top2 row:
-        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
-        convolve_row5x1(out, out2, data_t2, _convolution.data());
-
-        // Load 16 bytes from the top1 row:
-        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
-        convolve_row5x1(out, out2, data_t1, _convolution.data() + 5);
-
-        // Load 16 bytes from the middle row:
-        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
-        convolve_row5x1(out, out2, data_m, _convolution.data() + 10);
-
-        // Load 16 bytes from the low1 row:
-        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
-        convolve_row5x1(out, out2, data_b1, _convolution.data() + 15);
-
-        // Load 16 bytes from the low2 row:
-        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
-        convolve_row5x1(out, out2, data_b2, _convolution.data() + 20);
-
-        // Apply scale
-        if(_scale != 1)
-        {
-            // Convert to F32, scale and convert back to S32
-            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
-            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
-        }
-
-        // Clamp and store as U8 or S16:
-        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
-    },
-    input, output);
-}
-
-template <>
-template <typename OutputType>
-void NEConvolutionKernel<7>::convolution(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
-
-    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3));
-    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2));
-    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1));
-    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0));
-    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1));
-    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
-    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4_t out  = vdupq_n_s32(0);
-        int32x4_t out2 = vdupq_n_s32(0);
-
-        // Load 16 bytes from the top3 row:
-        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
-        convolve_row7x1(out, out2, data_t3, _convolution.data());
-
-        // Load 16 bytes from the top2 row:
-        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
-        convolve_row7x1(out, out2, data_t2, _convolution.data() + 7);
-
-        // Load 16 bytes from the top1 row:
-        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
-        convolve_row7x1(out, out2, data_t1, _convolution.data() + 14);
-
-        // Load 16 bytes from the middle row:
-        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
-        convolve_row7x1(out, out2, data_m, _convolution.data() + 21);
-
-        // Load 16 bytes from the low1 row:
-        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
-        convolve_row7x1(out, out2, data_b1, _convolution.data() + 28);
-
-        // Load 16 bytes from the low2 row:
-        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
-        convolve_row7x1(out, out2, data_b2, _convolution.data() + 35);
-
-        // Load 16 bytes from the low3 row:
-        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
-        convolve_row7x1(out, out2, data_b3, _convolution.data() + 42);
-
-        // Apply scale
-        if(_scale != 1)
-        {
-            // Convert to F32, scale and convert back to S32
-            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
-            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
-        }
-
-        // Clamp and store as U8 or S16:
-        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
-    },
-    input, output);
-}
-
-template <>
-template <typename OutputType>
-void NEConvolutionKernel<9>::convolution(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
-
-    const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4));
-    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3));
-    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2));
-    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1));
-    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0));
-    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1));
-    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2));
-    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
-    const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4_t out  = vdupq_n_s32(0);
-        int32x4_t out2 = vdupq_n_s32(0);
-
-        // Load 16 bytes from the top4 row:
-        const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset());
-        convolve_row9x1(out, out2, data_t4, _convolution.data());
-
-        // Load 16 bytes from the top3 row:
-        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
-        convolve_row9x1(out, out2, data_t3, _convolution.data() + 9);
-
-        // Load 16 bytes from the top2 row:
-        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
-        convolve_row9x1(out, out2, data_t2, _convolution.data() + 18);
-
-        // Load 16 bytes from the top1 row:
-        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
-        convolve_row9x1(out, out2, data_t1, _convolution.data() + 27);
-
-        // Load 16 bytes from the middle row:
-        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
-        convolve_row9x1(out, out2, data_m, _convolution.data() + 36);
-
-        // Load 16 bytes from the low1 row:
-        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
-        convolve_row9x1(out, out2, data_b1, _convolution.data() + 45);
-
-        // Load 16 bytes from the low2 row:
-        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
-        convolve_row9x1(out, out2, data_b2, _convolution.data() + 54);
-
-        // Load 16 bytes from the low3 row:
-        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
-        convolve_row9x1(out, out2, data_b3, _convolution.data() + 63);
-
-        // Load 16 bytes from the low4 row:
-        const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset());
-        convolve_row9x1(out, out2, data_b4, _convolution.data() + 72);
-
-        // Apply scale
-        if(_scale != 1)
-        {
-            // Convert to F32, scale and convert back to S32
-            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
-            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
-        }
-
-        // Clamp and store as U8 or S16:
-        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
-    },
-    input, output);
-}
-
-template <unsigned int matrix_size>
-void NEConvolutionKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    switch(_output->info()->data_type())
-    {
-        case DataType::U8:
-            convolution<uint8_t>(window);
-            break;
-        case DataType::S16:
-            convolution<int16_t>(window);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported Data type!");
-            break;
-    }
-}
-
-template class arm_compute::NEConvolutionKernel<3>;
-template class arm_compute::NEConvolutionKernel<5>;
-template class arm_compute::NEConvolutionKernel<7>;
-template class arm_compute::NEConvolutionKernel<9>;
-
-/****************************************************************************************\
- *                              Separable Square Convolution                            *
-\****************************************************************************************/
-
-template <unsigned int matrix_size>
-NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
-    : _conv_row{ { 0 } }, _border_size(0)
-{
-}
-
-template <unsigned int matrix_size>
-BorderSize             NESeparableConvolutionHorKernel<matrix_size>::border_size() const
-{
-    return _border_size;
-}
-
-template <unsigned int matrix_size>
-void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);
-
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
-
-    _input  = input;
-    _output = output;
-    std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
-    _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-template <unsigned int matrix_size>
-void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    switch(_output->info()->data_type())
-    {
-        case DataType::U16:
-            convolve<uint16_t>(window);
-            break;
-        case DataType::S16:
-            convolve<int16_t>(window);
-            break;
-        case DataType::S32:
-            convolve<int32_t>(window);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
-            break;
-    }
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -2);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const uint16x8x2_t data_u16 =
-        {
-            {
-                vmovl_u8(vget_low_u8(data)),
-                vmovl_u8(vget_high_u8(data))
-            }
-        };
-
-        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
-
-        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<5>::convolve<int16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -2);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-void NESeparableConvolutionHorKernel<5>::convolve<int32_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -2);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
-        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
-        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
-        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
-
-        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
-
-        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
-    },
-    input, output);
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<7>::convolve<uint16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -3);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const uint16x8x2_t data_u16 =
-        {
-            {
-                vmovl_u8(vget_low_u8(data)),
-                vmovl_u8(vget_high_u8(data))
-            }
-        };
-
-        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
-
-        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<7>::convolve<int16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -3);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-void NESeparableConvolutionHorKernel<7>::convolve<int32_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -3);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
-        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
-        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
-        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
-        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
-        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
-
-        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
-
-        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
-    },
-    input, output);
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<9>::convolve<uint16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -4);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const uint16x8x2_t data_u16 =
-        {
-            {
-                vmovl_u8(vget_low_u8(data)),
-                vmovl_u8(vget_high_u8(data))
-            }
-        };
-
-        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
-        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]);
-        out            = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]);
-
-        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-inline void NESeparableConvolutionHorKernel<9>::convolve<int16_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -4);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
-        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]);
-        out           = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]);
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-template <>
-template <>
-void NESeparableConvolutionHorKernel<9>::convolve<int32_t>(const Window &window)
-{
-    Window win_in(window);
-    win_in.shift(Window::DimX, -4);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
-        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
-        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
-        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
-        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
-        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
-        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);
-
-        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
-        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
-
-        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
-        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);
-
-        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
-    },
-    input, output);
-}
-
-template class arm_compute::NESeparableConvolutionHorKernel<5>;
-template class arm_compute::NESeparableConvolutionHorKernel<7>;
-template class arm_compute::NESeparableConvolutionHorKernel<9>;
-
-template <unsigned int matrix_size>
-NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
-    : _conv_col{ { 0 } }, _scale(0)
-{
-}
-
-template <unsigned int matrix_size>
-BorderSize             NESeparableConvolutionVertKernel<matrix_size>::border_size() const
-{
-    return BorderSize{ matrix_size / 2, 0 };
-}
-
-template <unsigned int matrix_size>
-void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);
-
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(scale == 0);
-
-    _input  = input;
-    _output = output;
-    std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
-    _scale = scale;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 16;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-template <unsigned int matrix_size>
-void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    switch(_input->info()->data_type())
-    {
-        case DataType::U16:
-            switch(_output->info()->data_type())
-            {
-                case DataType::U8:
-                    convolution_u16<uint8_t>(window);
-                    break;
-                case DataType::S16:
-                    convolution_u16<int16_t>(window);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-            break;
-        case DataType::S16:
-            switch(_output->info()->data_type())
-            {
-                case DataType::U8:
-                    convolution_s16<uint8_t>(window);
-                    break;
-                case DataType::S16:
-                    convolution_s16<int16_t>(window);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-            break;
-        case DataType::S32:
-            switch(_output->info()->data_type())
-            {
-                case DataType::U8:
-                    convolution_s32<uint8_t>(window);
-                    break;
-                case DataType::S16:
-                    convolution_s32<int16_t>(window);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
-            break;
-    }
-}
-
-template <unsigned int matrix_size>
-template <typename OutputType>
-void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-
-    Window win_in(win);
-    win_in.set_dimension_step(Window::DimX, 8);
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, win);
-
-    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
-    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
-    const int         k_half       = matrix_size / 2;
-
-    // Set row pointers
-    for(int i = -k_half; i <= k_half; ++i)
-    {
-        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
-    }
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        uint16x8_t out0 = vdupq_n_u16(0);
-        uint16x8_t out1 = vdupq_n_u16(0);
-
-        // First half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
-            out0                  = vmlaq_n_u16(out0, data, _conv_col[r]);
-        }
-
-        in.increment(Window::DimX);
-
-        // Second half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
-            out1                  = vmlaq_n_u16(out1, data, _conv_col[r]);
-        }
-
-        //scale the result if needed
-        if(_scale != 1)
-        {
-            float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
-            float32x4_t out0_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
-            out0_f32_high             = vmulq_f32(out0_f32_high, oneoverscale);
-            out0_f32_low              = vmulq_f32(out0_f32_low, oneoverscale);
-            store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
-
-            float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
-            float32x4_t out1_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
-            out1_f32_high             = vmulq_f32(out1_f32_high, oneoverscale);
-            out1_f32_low              = vmulq_f32(out1_f32_low, oneoverscale);
-            store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
-        }
-        else
-        {
-            store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
-        }
-    },
-    in, out);
-}
-
-template <unsigned int matrix_size>
-template <typename OutputType>
-void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-
-    Window win_in(win);
-    win_in.set_dimension_step(Window::DimX, 8);
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, win);
-
-    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
-    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
-    const int         k_half       = matrix_size / 2;
-
-    // Set row pointers
-    for(int i = -k_half; i <= k_half; ++i)
-    {
-        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
-    }
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int16x8_t out0 = vdupq_n_s16(0);
-        int16x8_t out1 = vdupq_n_s16(0);
-
-        // First half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
-            out0                 = vmlaq_n_s16(out0, data, _conv_col[r]);
-        }
-
-        in.increment(Window::DimX);
-
-        // Second half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
-            out1                 = vmlaq_n_s16(out1, data, _conv_col[r]);
-        }
-
-        //scale the result if needed
-        if(_scale != 1)
-        {
-            float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0)));
-            float32x4_t out0_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0)));
-            out0_f32_high             = vmulq_f32(out0_f32_high, oneoverscale);
-            out0_f32_low              = vmulq_f32(out0_f32_low, oneoverscale);
-            store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
-
-            float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1)));
-            float32x4_t out1_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1)));
-            out1_f32_high             = vmulq_f32(out1_f32_high, oneoverscale);
-            out1_f32_low              = vmulq_f32(out1_f32_low, oneoverscale);
-            store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
-        }
-        else
-        {
-            store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
-        }
-    },
-    in, out);
-}
-
-template <unsigned int matrix_size>
-template <typename OutputType>
-void NESeparableConvolutionVertKernel<matrix_size>::convolution_s32(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-
-    Window win_in(win);
-    win_in.set_dimension_step(Window::DimX, 8);
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, win);
-
-    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
-    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
-    const int         k_half       = matrix_size / 2;
-
-    // Set row pointers
-    for(int i = -k_half; i <= k_half; ++i)
-    {
-        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
-    }
-
-    const int32x4_t zero = vdupq_n_s32(0);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4x2_t out0 =
-        {
-            {
-                zero,
-                zero
-            }
-        };
-
-        int32x4x2_t out1 =
-        {
-            {
-                zero,
-                zero
-            }
-        };
-
-        // First half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
-            out0.val[0]            = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]);
-            out0.val[1]            = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]);
-        }
-
-        in.increment(Window::DimX);
-
-        // Second half
-        for(unsigned int r = 0; r < matrix_size; ++r)
-        {
-            const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
-            out1.val[0]            = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]);
-            out1.val[1]            = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]);
-        }
-
-        //scale the result if needed
-        if(_scale != 1)
-        {
-            float32x4_t out0_f32_odd  = vcvtq_f32_s32(out0.val[0]);
-            float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]);
-            out0_f32_odd              = vmulq_f32(out0_f32_odd, oneoverscale);
-            out0_f32_even             = vmulq_f32(out0_f32_even, oneoverscale);
-            out0.val[0]               = vcvtq_s32_f32(out0_f32_odd);
-            out0.val[1]               = vcvtq_s32_f32(out0_f32_even);
-
-            float32x4_t out1_f32_odd  = vcvtq_f32_s32(out1.val[0]);
-            float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]);
-            out1_f32_odd              = vmulq_f32(out1_f32_odd, oneoverscale);
-            out1_f32_even             = vmulq_f32(out1_f32_even, oneoverscale);
-            out1.val[0]               = vcvtq_s32_f32(out1_f32_odd);
-            out1.val[1]               = vcvtq_s32_f32(out1_f32_even);
-        }
-
-        const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]);
-        store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()));
-
-        const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]);
-        store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()) + 8);
-    },
-    in, out);
-}
-
-template class arm_compute::NESeparableConvolutionVertKernel<5>;
-template class arm_compute::NESeparableConvolutionVertKernel<7>;
-template class arm_compute::NESeparableConvolutionVertKernel<9>;
-
-/****************************************************************************************\
- *                                 Rectangle Convolution                                *
-\****************************************************************************************/
-
-NEConvolutionRectangleKernel::NEConvolutionRectangleKernel()
-    : _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0)
-{
-}
-
-BorderSize NEConvolutionRectangleKernel::border_size() const
-{
-    return _border_size;
-}
-
-void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
-
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9);
-    ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9);
-    ARM_COMPUTE_ERROR_ON(0 == scale);
-
-    _input       = input;
-    _output      = output;
-    _scale       = scale;
-    _border_size = BorderSize(height / 2, width / 2);
-
-    // Setup the convolution matrix
-    const uint32_t nr_elements = width * height;
-    _convolution.resize(nr_elements);
-    std::copy_n(conv, nr_elements, _convolution.begin());
-
-    // Set function index to help choose appropriate function in run()
-    _func_idx = get_index(height) * 4 + get_index(width);
-    ARM_COMPUTE_ERROR_ON(_func_idx > (_nr_supported_sizes * _nr_supported_sizes));
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size);
-
-    INEKernel::configure(win);
-}
-
-void NEConvolutionRectangleKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window);
-
-    // uint8_t function table
-    static const std::array<ConvolutionRectangleFunction, 16> func_table_u8 =
-    {
-        {
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 3>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 5>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 7>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 9>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 3>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 5>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 7>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 9>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 3>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 5>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 7>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 9>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 3>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 5>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 7>,
-            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 9>
-        }
-    };
-    // int16_t function table
-    static const std::array<ConvolutionRectangleFunction, 16> func_table_s16 =
-    {
-        {
-            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 3>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 5>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 7>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 9>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 3>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 5>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 7>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 9>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 3>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 5>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 7>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 9>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 3>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 5>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 7>,
-            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 9>
-        }
-    };
-
-    // Run appropriate function
-    switch(_output->info()->data_type())
-    {
-        case DataType::U8:
-            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
-            (this->*func_table_u8[_func_idx])(window);
-            break;
-        case DataType::S16:
-            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
-            (this->*func_table_s16[_func_idx])(window);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-    }
-}
-
-unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val)
-{
-    switch(val)
-    {
-        case 3:
-            return 0;
-        case 5:
-            return 1;
-        case 7:
-            return 2;
-        case 9:
-            return 3;
-        default:
-            ARM_COMPUTE_ERROR("Not supported dimension size");
-            return 0;
-    }
-}
-
-template <typename OutputType, unsigned int rows, unsigned int cols>
-void NEConvolutionRectangleKernel::convolution(const Window &win)
-{
-    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    std::array<unsigned char *, rows> input_ptrs{ {} };
-    const int16_t    *conv       = _convolution.data();
-    const float32x4_t scale_val  = vdupq_n_f32(1.0f / _scale);
-    const int         k_row_half = rows / 2;
-    const int         k_col_half = cols / 2;
-
-    // Set row pointers
-    for(int i = -k_row_half; i <= k_row_half; ++i)
-    {
-        input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
-    }
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int32x4_t out  = vdupq_n_s32(0);
-        int32x4_t out2 = vdupq_n_s32(0);
-
-        // Perform appropriate convolution
-        for(unsigned int r = 0; r < rows; ++r)
-        {
-            const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
-            if(3 == cols)
-            {
-                convolve_row3x1(out, out2, data, conv + r * cols);
-            }
-            else if(5 == cols)
-            {
-                convolve_row5x1(out, out2, data, conv + r * cols);
-            }
-            else if(7 == cols)
-            {
-                convolve_row7x1(out, out2, data, conv + r * cols);
-            }
-            else if(9 == cols)
-            {
-                convolve_row9x1(out, out2, data, conv + r * cols);
-            }
-            else
-            {
-                ARM_COMPUTE_ERROR("Unsupported number of columns");
-            }
-        }
-
-        // Apply scale
-        if(_scale != 1)
-        {
-            // Convert to F32, scale and convert back to S32
-            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
-            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
-        }
-
-        // Clamp and store as U8 or S16:
-        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
-    },
-    input, output);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.h b/src/core/NEON/kernels/NEConvolutionKernel.h
deleted file mode 100644
index b8bf1d1..0000000
--- a/src/core/NEON/kernels/NEConvolutionKernel.h
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL_H
-#define ARM_COMPUTE_NECONVOLUTIONKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/NEON/INESimpleKernel.h"
-
-#include <array>
-#include <cstdint>
-#include <vector>
-
-namespace arm_compute
-{
-class ITensor;
-
-/****************************************************************************************\
- *                                    Square Convolution                                *
-\****************************************************************************************/
-
-/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
- * The client can supply a convolution matrix \f$ C_{m,n} \f$.
- * @f{eqnarray}{
- *  k_0 &=& \frac{m}{2}  \\
- *  l_0 &=& \frac{n}{2}  \\
- *  sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
- *  @f}
- *
- * @note The above equation for this function is similar to the default OpenCV Filter2D function,
- *       which actually computes a correlation and not a convolution.
- *       In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
- */
-template <unsigned int matrix_size>
-class NEConvolutionKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEConvolutionKernel";
-    }
-    /** Default constructor */
-    NEConvolutionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NEConvolutionKernel(const NEConvolutionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NEConvolutionKernel &operator=(const NEConvolutionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEConvolutionKernel(NEConvolutionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEConvolutionKernel &operator=(NEConvolutionKernel &&) = default;
-    /** Default destructor */
-    ~NEConvolutionKernel() = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    template <typename OutputType>
-    void convolution(const Window &win);
-
-protected:
-    uint32_t _scale;                                             /**< scale of the convolution */
-    std::array<int16_t, matrix_size *matrix_size> _convolution;  /**< convolution matrix */
-};
-
-/** Interface for the kernel which applied a 3x3 convolution to a tensor.*/
-using NEConvolution3x3Kernel = NEConvolutionKernel<3>;
-/** Interface for the kernel which applied a 5x5 convolution to a tensor.*/
-using NEConvolution5x5Kernel = NEConvolutionKernel<5>;
-/** Interface for the kernel which applied a 7x7 convolution to a tensor.*/
-using NEConvolution7x7Kernel = NEConvolutionKernel<7>;
-///** Interface for the kernel which applied a 9x9 convolution to a tensor.*/
-using NEConvolution9x9Kernel = NEConvolutionKernel<9>;
-
-/****************************************************************************************\
- *                              Separable Square Convolution                            *
-\****************************************************************************************/
-
-/** Kernel for the Horizontal pass of a Separable Convolution */
-template <unsigned int matrix_size>
-class NESeparableConvolutionHorKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESeparableConvolutionHorKernel";
-    }
-    /** Default constructor */
-    NESeparableConvolutionHorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NESeparableConvolutionHorKernel(const NESeparableConvolutionHorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NESeparableConvolutionHorKernel &operator=(const NESeparableConvolutionHorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESeparableConvolutionHorKernel(NESeparableConvolutionHorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESeparableConvolutionHorKernel &operator=(NESeparableConvolutionHorKernel &&) = default;
-    /** Default destructor */
-    ~NESeparableConvolutionHorKernel() = default;
-
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U16, S16, S32.
-     * @param[in]  conv_row         Convolution matrix to apply to the input tensor.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Apply the object's convolution to the given window of the input tensor..
-     *
-     * @param[in] window Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolve(const Window &window);
-
-    std::array<int16_t, matrix_size> _conv_row; /**< Convolution coefficients */
-    BorderSize _border_size;                    /**< Border size */
-};
-
-/** Interface for the kernel which applied a 5x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution5x5HorKernel = NESeparableConvolutionHorKernel<5>;
-/** Interface for the kernel which applied a 7x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution7x7HorKernel = NESeparableConvolutionHorKernel<7>;
-/** Interface for the kernel which applied a 9x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution9x9HorKernel = NESeparableConvolutionHorKernel<9>;
-
-/** Kernel for the Vertical pass of a Separable Convolution */
-template <unsigned int matrix_size>
-class NESeparableConvolutionVertKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESeparableConvolutionVertKernel";
-    }
-    /** Default constructor */
-    NESeparableConvolutionVertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NESeparableConvolutionVertKernel(const NESeparableConvolutionVertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NESeparableConvolutionVertKernel &operator=(const NESeparableConvolutionVertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESeparableConvolutionVertKernel(NESeparableConvolutionVertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESeparableConvolutionVertKernel &operator=(NESeparableConvolutionVertKernel &&) = default;
-    /** Default destructor */
-    ~NESeparableConvolutionVertKernel() = default;
-
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U16, S16, S32.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv_col         Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Apply the object's convolution to the given window of the input tensor.
-     *  This function is used if the intermediate values have been stored as U16.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolution_u16(const Window &win);
-    /** Apply the object's convolution to the given window of the input tensor.
-     *  This function is used if the intermediate values have been stored as S16.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolution_s16(const Window &win);
-    /** Apply the object's convolution to the given window of the input tensor.
-     *  This function is used if the intermediate values have been stored as S32.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolution_s32(const Window &win);
-
-    std::array<int16_t, matrix_size> _conv_col; /**< Convolution coefficients */
-    uint32_t _scale;                            /**< Convolution's scale */
-};
-
-/** Interface for the kernel which applied a 1x5 vertical convolution to a tensor.*/
-using NESeparableConvolution5x5VertKernel = NESeparableConvolutionVertKernel<5>;
-/** Interface for the kernel which applied a 1x7 vertical convolution to a tensor.*/
-using NESeparableConvolution7x7VertKernel = NESeparableConvolutionVertKernel<7>;
-/** Interface for the kernel which applied a 1x9 vertical convolution to a tensor.*/
-using NESeparableConvolution9x9VertKernel = NESeparableConvolutionVertKernel<9>;
-
-/****************************************************************************************\
- *                                 Rectangle Convolution                                *
-\****************************************************************************************/
-
-/** Kernel for the running convolution on a rectangle matrix.
- *
- * @note Supports combinations of 3,5,7 and 9.
- */
-class NEConvolutionRectangleKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEConvolutionRectangleKernel";
-    }
-    /** Default constructor */
-    NEConvolutionRectangleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &&) = default;
-    /** Default destructor */
-    ~NEConvolutionRectangleKernel() = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  width            Width of convolution matrix (Number of columns)
-     * @param[in]  height           Height of convolution matrix (Number of rows)
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    unsigned int get_index(uint32_t val);
-    /** Apply the object's convolution to the given window of the input tensor.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType, unsigned int rows, unsigned int cols>
-    void convolution(const Window &win);
-
-protected:
-    const ITensor            *_input;       /**< Input tensor */
-    ITensor                  *_output;      /**< Output tensor */
-    uint32_t                  _scale;       /**< Scale of the convolution */
-    std::vector<int16_t>      _convolution; /**< Convolution matrix */
-    BorderSize                _border_size; /**< Calculated border width */
-    uint32_t                  _func_idx;    /**< Index used to specify convolution function to be used */
-    const static unsigned int _nr_supported_sizes
-    {
-        4
-    }; /**< Number of supported permutations */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECONVOLUTIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
deleted file mode 100644
index 9f5dfcd..0000000
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
+++ /dev/null
@@ -1,516 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace fp16
-{
-inline void mask_top(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
-{
-    // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
-    mask = vandq_u16(mask, vcgeq_f16(vc, in0));
-    mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 1)));
-    mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 2)));
-}
-
-inline void mask_middle(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
-{
-    // vc >= nc.val[0], vc > nc.val[2]
-    mask = vandq_u16(mask, vcgeq_f16(vc, in0));
-    mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
-}
-
-inline void mask_bottom(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
-{
-    // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
-    mask = vandq_u16(mask, vcgtq_f16(vc, in0));
-    mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 1)));
-    mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
-}
-
-inline void non_maxima_suppression3x3_F32_F32(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
-{
-    auto       in  = static_cast<const float *__restrict>(in_ptr) - 1;
-    const auto out = static_cast<float *__restrict>(out_ptr);
-
-    // Get centre scores
-    const float16x8x2_t vc =
-    {
-        vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 1)), vcvt_f16_f32(vld1q_f32(in + 5))),
-        vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 9)), vcvt_f16_f32(vld1q_f32(in + 13)))
-    };
-
-    // Neighboring pixels
-    in -= in_stride;
-
-    static const float16x4_t  zero_f16x4 = vdup_n_f16(0);
-    static const uint16x8_t   zero_u16   = vdupq_n_u16(0);
-    static const uint16x8_t   true_mask  = vceqq_u16(zero_u16, zero_u16);
-    static const uint16x8x2_t true_mask_x2 =
-    {
-        true_mask,
-        true_mask
-    };
-
-    uint16x8x2_t mask = true_mask_x2;
-
-    // Top row
-    const float16x8_t tmp_top0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
-    const float16x8_t tmp_top1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
-    const float16x8_t tmp_top2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
-
-    // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2]
-    mask_top(vc.val[0], tmp_top0, tmp_top1, mask.val[0]);
-    mask_top(vc.val[1], tmp_top1, tmp_top2, mask.val[1]);
-
-    in += in_stride;
-
-    // Middle row
-    const float16x8_t tmp_mid0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
-    const float16x8_t tmp_mid1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
-    const float16x8_t tmp_mid2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
-
-    // vc >= nc.val[0], vc > nc.val[2]
-    mask_middle(vc.val[0], tmp_mid0, tmp_mid1, mask.val[0]);
-    mask_middle(vc.val[1], tmp_mid1, tmp_mid2, mask.val[1]);
-
-    in += in_stride;
-
-    // Bottom row
-    const float16x8_t tmp_bot0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
-    const float16x8_t tmp_bot1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
-    const float16x8_t tmp_bot2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
-
-    // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
-    mask_bottom(vc.val[0], tmp_bot0, tmp_bot1, mask.val[0]);
-    mask_bottom(vc.val[1], tmp_bot1, tmp_bot2, mask.val[1]);
-
-    // Store
-    static const float16x8_t zero_f16x8 = vdupq_n_f16(0);
-
-    const float16x8_t suppressed0 = vbslq_f16(mask.val[0], vc.val[0], zero_f16x8);
-    vst1q_f32(out + 0, vcvt_f32_f16(vget_low_f16(suppressed0)));
-    vst1q_f32(out + 4, vcvt_f32_f16(vget_high_f16(suppressed0)));
-
-    const float16x8_t suppressed1 = vbslq_f16(mask.val[1], vc.val[1], zero_f16x8);
-    vst1q_f32(out + 8, vcvt_f32_f16(vget_low_f16(suppressed1)));
-    vst1q_f32(out + 12, vcvt_f32_f16(vget_high_f16(suppressed1)));
-}
-
-inline void non_maxima_suppression3x3_U8_U8(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
-{
-    auto       in  = static_cast<const uint8_t *__restrict>(in_ptr) - 1;
-    const auto out = static_cast<uint8_t *__restrict>(out_ptr);
-
-    // Get centre scores
-    const uint8x16_t vc = vld1q_u8(in + 1);
-
-    // Neighboring pixels
-    in -= in_stride;
-
-    // Top row
-    const uint8x16_t l_nc_0 = vld1q_u8(in);
-    const uint8x16_t m_nc_0 = vld1q_u8(in + 1);
-    const uint8x16_t r_nc_0 = vld1q_u8(in + 2);
-
-    // Keep center scores if ...
-    // vc >= l_nc_0, vc >= m_nc_0, vc >= r_nc_0
-    uint8x16_t mask = vcgeq_u8(vc, l_nc_0);
-    mask            = vandq_u8(mask, vcgeq_u8(vc, m_nc_0));
-    mask            = vandq_u8(mask, vcgeq_u8(vc, r_nc_0));
-
-    in += in_stride;
-
-    // Middle row
-    const uint8x16_t l_nc_1 = vld1q_u8(in);
-    const uint8x16_t r_nc_1 = vld1q_u8(in + 2);
-
-    // ... and ...
-    // vc >= l_nc_1, vc > r_nc_1
-    mask = vandq_u8(mask, vcgeq_u8(vc, l_nc_1));
-    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_1));
-
-    in += in_stride;
-
-    // Bottom row
-    const uint8x16_t l_nc_2 = vld1q_u8(in);
-    const uint8x16_t m_nc_2 = vld1q_u8(in + 1);
-    const uint8x16_t r_nc_2 = vld1q_u8(in + 2);
-
-    // ... and ...
-    // vc > l_nc_2, vc > m_nc_2, vc > r_nc_2
-    mask = vandq_u8(mask, vcgtq_u8(vc, l_nc_2));
-    mask = vandq_u8(mask, vcgtq_u8(vc, m_nc_2));
-    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_2));
-
-    // Store
-    static const uint8x16_t zero = vdupq_n_u8(0);
-    vst1q_u8(out, vbslq_u8(mask, vc, zero));
-}
-} // namespace fp16
-
-void NENonMaximaSuppression3x3FP16Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    _input  = input;
-    _output = output;
-
-    switch(input->info()->data_type())
-    {
-        case DataType::U8:
-            _func = &fp16::non_maxima_suppression3x3_U8_U8;
-            break;
-        default:
-            _func = &fp16::non_maxima_suppression3x3_F32_F32;
-            break;
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    const unsigned int     num_elems_read_per_iteration      = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
-    constexpr unsigned int num_elems_written_per_iteration   = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-namespace
-{
-inline void non_maxima_suppression3x3_FLOAT_FLOAT(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
-{
-    auto       input  = static_cast<const float *__restrict>(input_ptr) - 1;
-    const auto output = static_cast<float *__restrict>(output_ptr);
-
-    // Get centre scores
-    const float32x4x4_t vc =
-    {
-        {
-            vld1q_f32(input + 1),
-            vld1q_f32(input + 5),
-            vld1q_f32(input + 9),
-            vld1q_f32(input + 13)
-        }
-    };
-
-    // Neighboring pixels
-    float32x4x4_t l_nc{ {} };
-    float32x4x4_t m_nc{ {} };
-    float32x4x4_t r_nc{ {} };
-
-    input -= input_stride;
-
-    // Row0 - Low part
-    float32x4_t tmp_low   = vld1q_f32(input);
-    float32x4_t tmp_high  = vld1q_f32(input + 4);
-    float32x4_t tmp_high1 = vld1q_f32(input + 8);
-
-    l_nc.val[0] = tmp_low;
-    m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[1] = tmp_low;
-    m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // Row0 - High part
-    tmp_low   = tmp_high1;
-    tmp_high  = vld1q_f32(input + 12);
-    tmp_high1 = vld1q_f32(input + 16);
-
-    l_nc.val[2] = tmp_low;
-    m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[3] = tmp_low;
-    m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // mc >= nc.val[0], mc >= nc.val[1], mc >= nc.val[2]
-    uint32x4x4_t mask{ {} };
-    mask.val[0] = vcgeq_f32(vc.val[0], l_nc.val[0]);
-    mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], m_nc.val[0]));
-    mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], r_nc.val[0]));
-    mask.val[1] = vcgeq_f32(vc.val[1], l_nc.val[1]);
-    mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], m_nc.val[1]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], r_nc.val[1]));
-    mask.val[2] = vcgeq_f32(vc.val[2], l_nc.val[2]);
-    mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], m_nc.val[2]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], r_nc.val[2]));
-    mask.val[3] = vcgeq_f32(vc.val[3], l_nc.val[3]);
-    mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], m_nc.val[3]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], r_nc.val[3]));
-
-    input += input_stride;
-
-    // Row1 - Low part
-    tmp_low   = vld1q_f32(input);
-    tmp_high  = vld1q_f32(input + 4);
-    tmp_high1 = vld1q_f32(input + 8);
-
-    l_nc.val[0] = tmp_low;
-    r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[1] = tmp_low;
-    r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // Row1 - High part
-    tmp_low   = tmp_high1;
-    tmp_high  = vld1q_f32(input + 12);
-    tmp_high1 = vld1q_f32(input + 16);
-
-    l_nc.val[2] = tmp_low;
-    r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[3] = tmp_low;
-    r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // mc >= nc.val[0], mc > nc.val[2]
-    mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], l_nc.val[0]));
-    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], l_nc.val[1]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], l_nc.val[2]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], l_nc.val[3]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));
-
-    input += input_stride;
-
-    // Row2 - Low part
-    tmp_low   = vld1q_f32(input);
-    tmp_high  = vld1q_f32(input + 4);
-    tmp_high1 = vld1q_f32(input + 8);
-
-    l_nc.val[0] = tmp_low;
-    m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[1] = tmp_low;
-    m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // Row2 - High part
-    tmp_low   = tmp_high1;
-    tmp_high  = vld1q_f32(input + 12);
-    tmp_high1 = vld1q_f32(input + 16);
-
-    l_nc.val[2] = tmp_low;
-    m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
-
-    tmp_low  = tmp_high;
-    tmp_high = tmp_high1;
-
-    l_nc.val[3] = tmp_low;
-    m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
-    r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
-
-    // mc > nc.val[0], mc > nc.val[1], mc > nc.val[2]
-    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], l_nc.val[0]));
-    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], m_nc.val[0]));
-    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], l_nc.val[1]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], m_nc.val[1]));
-    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], l_nc.val[2]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], m_nc.val[2]));
-    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], l_nc.val[3]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], m_nc.val[3]));
-    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));
-
-    static const float32x4_t zero = vdupq_n_f32(0.f);
-
-    // Store
-    vst1q_f32(output + 0, vbslq_f32(mask.val[0], vc.val[0], zero));
-    vst1q_f32(output + 4, vbslq_f32(mask.val[1], vc.val[1], zero));
-    vst1q_f32(output + 8, vbslq_f32(mask.val[2], vc.val[2], zero));
-    vst1q_f32(output + 12, vbslq_f32(mask.val[3], vc.val[3], zero));
-}
-
-inline void non_maxima_suppression3x3_U8_U8(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
-{
-    auto       input  = static_cast<const uint8_t *__restrict>(input_ptr) - 1;
-    const auto output = static_cast<uint8_t *__restrict>(output_ptr);
-
-    // Get centre scores
-    const uint8x16_t vc = vld1q_u8(input + 1);
-
-    // Neighboring pixels
-    uint8x16_t l_nc{};
-    uint8x16_t m_nc{};
-    uint8x16_t r_nc{};
-
-    input -= input_stride;
-
-    // Row0
-    l_nc = vld1q_u8(input);
-    m_nc = vld1q_u8(input + 1);
-    r_nc = vld1q_u8(input + 2);
-
-    // mc >= l_nc, mc >= m_nc, mc >= r_nc
-    uint8x16_t mask = vcgeq_u8(vc, l_nc);
-    mask            = vandq_u8(mask, vcgeq_u8(vc, m_nc));
-    mask            = vandq_u8(mask, vcgeq_u8(vc, r_nc));
-
-    input += input_stride;
-
-    // Row1
-    l_nc = vld1q_u8(input);
-    r_nc = vld1q_u8(input + 2);
-
-    // mc >= l_nc, mc > r_nc
-    mask = vandq_u8(mask, vcgeq_u8(vc, l_nc));
-    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));
-
-    input += input_stride;
-
-    // Row2
-    l_nc = vld1q_u8(input);
-    m_nc = vld1q_u8(input + 1);
-    r_nc = vld1q_u8(input + 2);
-
-    // mc > l_nc, mc > m_nc, mc > r_nc
-    mask = vandq_u8(mask, vcgtq_u8(vc, l_nc));
-    mask = vandq_u8(mask, vcgtq_u8(vc, m_nc));
-    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));
-
-    static const uint8x16_t zero = vdupq_n_u8(0);
-
-    // Store
-    vst1q_u8(output, vbslq_u8(mask, vc, zero));
-}
-} // namespace
-
-NENonMaximaSuppression3x3Kernel::NENonMaximaSuppression3x3Kernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr)
-{
-}
-
-BorderSize NENonMaximaSuppression3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void NENonMaximaSuppression3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    _input  = input;
-    _output = output;
-
-    if(input->info()->data_type() == DataType::U8)
-    {
-        _func = &non_maxima_suppression3x3_U8_U8;
-    }
-    else
-    {
-        _func = &non_maxima_suppression3x3_FLOAT_FLOAT;
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    const unsigned int     num_elems_read_per_iteration      = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
-    constexpr unsigned int num_elems_written_per_iteration   = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NENonMaximaSuppression3x3Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    const size_t input_stride = _input->info()->strides_in_bytes()[1] / element_size_from_data_type(_input->info()->data_type());
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        _func(input.ptr(), output.ptr(), input_stride);
-    },
-    input, output);
-}
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
deleted file mode 100644
index 4194dac..0000000
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H
-#define ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface to perform Non-Maxima suppression over a 3x3 window using Neon
- *
- */
-class NENonMaximaSuppression3x3Kernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NENonMaximaSuppression3x3Kernel";
-    }
-    /** Default constructor */
-    NENonMaximaSuppression3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENonMaximaSuppression3x3Kernel(const NENonMaximaSuppression3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENonMaximaSuppression3x3Kernel &operator=(const NENonMaximaSuppression3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NENonMaximaSuppression3x3Kernel(NENonMaximaSuppression3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NENonMaximaSuppression3x3Kernel &operator=(NENonMaximaSuppression3x3Kernel &&) = default;
-    /** Default destructor */
-    ~NENonMaximaSuppression3x3Kernel() = default;
-
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8/F32
-     * @param[out] output           Destination tensor. Data types supported: same as @p input
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-protected:
-    /** Common signature for all the specialised non-maxima suppression 3x3 functions
-     *
-     * @param[in]  input_ptr    Pointer to the input tensor.
-     * @param[out] output_ptr   Pointer to the output tensor
-     * @param[in]  input_stride Stride of the input tensor
-     */
-    using NonMaxSuppr3x3Function = void(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride);
-
-    NonMaxSuppr3x3Function *_func;   /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
-    const ITensor          *_input;  /**< Source tensor */
-    ITensor                *_output; /**< Destination tensor */
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Neon kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32
- */
-class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
-{
-public:
-    const char *name() const override
-    {
-        return "NENonMaximaSuppression3x3FP16Kernel";
-    }
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8/F32.
-     * @param[out] output           Destination tensor. Data types supported: same as @p input
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-};
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** Neon kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32 */
-using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
-#endif /* _ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
new file mode 100644
index 0000000..24d0dd8
--- /dev/null
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/NEON/kernels/NERemapKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const float32x4_t &width, const float32x4_t &height, const int32x4_t &stride)
+{
+    const float32x4_t lowerxy = vdupq_n_f32(-1.f);
+
+    float32x4_t x = vld1q_f32(mapx_ptr);
+    float32x4_t y = vld1q_f32(mapy_ptr);
+
+    // Clamp x to [-1, width] and y to [-1, height]
+    x = vmaxq_f32(lowerxy, vminq_f32(x, width));
+    y = vmaxq_f32(lowerxy, vminq_f32(y, height));
+
+    const int32x4_t x_s32 = vcvtq_s32_f32(x);
+    const int32x4_t y_s32 = vcvtq_s32_f32(y);
+
+    return vmlaq_s32(x_s32, y_s32, stride);
+}
+
+} // namespace
+
+NERemapKernel::NERemapKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
+{
+}
+
+BorderSize NERemapKernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+
+    _input  = input;
+    _output = output;
+    _map_x  = map_x;
+    _map_y  = map_y;
+
+    switch(policy)
+    {
+        case InterpolationPolicy::NEAREST_NEIGHBOR:
+        {
+            _func = &NERemapKernel::remap_nearest;
+            break;
+        }
+        case InterpolationPolicy::BILINEAR:
+        {
+            _func = &NERemapKernel::remap_bilinear;
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+            break;
+    }
+
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    const int total_right  = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
+    const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
+
+    AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
+
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mapx_access(map_x->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mapy_access(map_y->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, mapx_access, mapy_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NERemapKernel::remap_nearest(const Window &window)
+{
+    // Don't increment in X and Y direction for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    Iterator in(_input, win_in);
+    Iterator out(_output, window);
+    Iterator mapx(_map_x, window);
+    Iterator mapy(_map_y, window);
+
+    const float32x4_t width     = vdupq_n_f32(static_cast<float>(_input->info()->dimension(0)));
+    const float32x4_t height    = vdupq_n_f32(static_cast<float>(_input->info()->dimension(1)));
+    const int32x4_t   in_stride = vdupq_n_s32(static_cast<int32_t>(_input->info()->strides_in_bytes()[1]));
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        const auto     mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
+        const auto     mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
+        const uint8_t *in_ptr   = in.ptr();
+
+        const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr + 0, mapy_ptr + 0, width, height, in_stride);
+        const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, width, height, in_stride);
+        const int32x4_t offset2 = offset_nearest_interpolation(mapx_ptr + 8, mapy_ptr + 8, width, height, in_stride);
+        const int32x4_t offset3 = offset_nearest_interpolation(mapx_ptr + 12, mapy_ptr + 12, width, height, in_stride);
+
+        uint8x16_t tmp = vdupq_n_u8(0);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 0)], tmp, 8);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 1)], tmp, 9);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 2)], tmp, 10);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 3)], tmp, 11);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 0)], tmp, 12);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 1)], tmp, 13);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 2)], tmp, 14);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 3)], tmp, 15);
+        vst1q_u8(out.ptr(), tmp);
+    },
+    in, out, mapx, mapy);
+}
+
+void NERemapKernel::remap_bilinear(const Window &window)
+{
+    using namespace scale_helpers;
+
+    // Don't increment in X and Y direction for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    Iterator in(_input, win_in);
+    Iterator out(_output, window);
+    Iterator mapx(_map_x, window);
+    Iterator mapy(_map_y, window);
+
+    const size_t width     = _input->info()->dimension(0);
+    const size_t height    = _input->info()->dimension(1);
+    const size_t in_stride = _input->info()->strides_in_bytes()[1];
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        const auto     mapx_ptr = reinterpret_cast<float *>(mapx.ptr());
+        const auto     mapy_ptr = reinterpret_cast<float *>(mapy.ptr());
+        const uint8_t *in_ptr   = in.ptr();
+
+        uint8x8_t tmp0 = vdup_n_u8(0);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
+
+        uint8x8_t tmp1 = vdup_n_u8(0);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
+
+        vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+    },
+    in, out, mapx, mapy);
+}
+
+void NERemapKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h
new file mode 100644
index 0000000..adc7f4b
--- /dev/null
+++ b/src/core/NEON/kernels/NERemapKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEREMAPKERNEL_H
+#define ARM_COMPUTE_NEREMAPKERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Neon kernel to perform a remap on a tensor */
+class NERemapKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NERemapKernel";
+    }
+    /** Default constructor */
+    NERemapKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NERemapKernel(const NERemapKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NERemapKernel &operator=(const NERemapKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NERemapKernel(NERemapKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NERemapKernel &operator=(NERemapKernel &&) = default;
+    /** Default destructor */
+    ~NERemapKernel() = default;
+
+    /** Initialize the kernel's input, coordinate maps, output and interpolation policy.
+     *
+     * @param[in]  input  Source tensor. Data type supported: U8.
+     * @param[in]  map_x  Map for X coordinates. Data type supported: F32.
+     * @param[in]  map_y  Map for Y coordinates. Data type supported: F32.
+     * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
+     * @param[in]  policy The interpolation type.
+     */
+    void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Function to perform nearest-neighbor interpolation on the given window */
+    void remap_nearest(const Window &window);
+    /** Function to perform bilinear interpolation on the given window */
+    void remap_bilinear(const Window &window);
+    /** Remap function to use for the particular interpolation type passed to configure() */
+    void (NERemapKernel::*_func)(const Window &window);
+
+    const ITensor *_input;  /**< Input image */
+    ITensor       *_output; /**< Output image */
+    const ITensor *_map_x;  /**< Input remap x coordinates */
+    const ITensor *_map_y;  /**< Input remap y coordinates */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */
diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
deleted file mode 100644
index ff5b0a8..0000000
--- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
-
-#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
-}
-
-void CLAbsoluteDifference::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-    auto k = std::make_unique<CLAbsoluteDifferenceKernel>();
-    k->configure(compile_context, input1, input2, output);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
deleted file mode 100644
index 44020fd..0000000
--- a/src/runtime/CL/functions/CLAccumulate.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
-
-#include "src/core/CL/kernels/CLAccumulateKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, accum);
-}
-
-void CLAccumulate::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum)
-{
-    auto k = std::make_unique<CLAccumulateKernel>();
-    k->configure(compile_context, input, accum);
-    _kernel = std::move(k);
-}
-
-void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum);
-}
-
-void CLAccumulateWeighted::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum)
-{
-    auto k = std::make_unique<CLAccumulateWeightedKernel>();
-    k->configure(compile_context, input, alpha, accum);
-    _kernel = std::move(k);
-}
-
-void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum);
-}
-
-void CLAccumulateSquared::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum)
-{
-    auto k = std::make_unique<CLAccumulateSquaredKernel>();
-    k->configure(compile_context, input, shift, accum);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp
deleted file mode 100644
index 09e24d1..0000000
--- a/src/runtime/CL/functions/CLBox3x3.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLBox3x3.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLBox3x3Kernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLBox3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLBox3x3Kernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
deleted file mode 100644
index 7e99a1b..0000000
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLCannyEdge.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
-#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
-#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
-#include "src/core/CL/kernels/CLCannyEdgeKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
-#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
-
-using namespace arm_compute;
-
-CLCannyEdge::CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _sobel(),
-      _gradient(std::make_unique<CLGradientKernel>()),
-      _border_mag_gradient(std::make_unique<CLFillBorderKernel>()),
-      _non_max_suppr(std::make_unique<CLEdgeNonMaxSuppressionKernel>()),
-      _edge_trace(std::make_unique<CLEdgeTraceKernel>()),
-      _gx(),
-      _gy(),
-      _mag(),
-      _phase(),
-      _nonmax(),
-      _visited(),
-      _recorded(),
-      _l1_list_counter(),
-      _l1_stack(),
-      _output(nullptr)
-{
-}
-
-CLCannyEdge::~CLCannyEdge() = default;
-
-void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
-                            uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, gradient_size, norm_type, border_mode, constant_border_value);
-}
-
-void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type,
-                            BorderMode border_mode,
-                            uint8_t    constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
-    ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7));
-    ARM_COMPUTE_ERROR_ON((lower_thr < 0) || (lower_thr >= upper_thr));
-
-    _output = output;
-
-    const unsigned int L1_hysteresis_stack_size = 8;
-    const TensorShape  shape                    = input->info()->tensor_shape();
-
-    TensorInfo gradient_info;
-    TensorInfo info;
-
-    // Initialize images
-    if(gradient_size < 7)
-    {
-        gradient_info.init(shape, 1, arm_compute::DataType::S16);
-        info.init(shape, 1, arm_compute::DataType::U16);
-    }
-    else
-    {
-        gradient_info.init(shape, 1, arm_compute::DataType::S32);
-        info.init(shape, 1, arm_compute::DataType::U32);
-    }
-
-    _gx.allocator()->init(gradient_info);
-    _gy.allocator()->init(gradient_info);
-    _mag.allocator()->init(info);
-    _nonmax.allocator()->init(info);
-
-    TensorInfo info_u8(shape, 1, arm_compute::DataType::U8);
-    _phase.allocator()->init(info_u8);
-    _l1_list_counter.allocator()->init(info_u8);
-
-    TensorInfo info_u32(shape, 1, arm_compute::DataType::U32);
-    _visited.allocator()->init(info_u32);
-    _recorded.allocator()->init(info_u32);
-
-    TensorShape shape_l1_stack = input->info()->tensor_shape();
-    shape_l1_stack.set(0, input->info()->dimension(0) * L1_hysteresis_stack_size);
-    TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32);
-    _l1_stack.allocator()->init(info_s32);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_gx);
-    _memory_group.manage(&_gy);
-
-    // Configure/Init sobelNxN
-    if(gradient_size == 3)
-    {
-        auto k = std::make_unique<CLSobel3x3>();
-        k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-        _sobel = std::move(k);
-    }
-    else if(gradient_size == 5)
-    {
-        auto k = std::make_unique<CLSobel5x5>();
-        k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-        _sobel = std::move(k);
-    }
-    else if(gradient_size == 7)
-    {
-        auto k = std::make_unique<CLSobel7x7>();
-        k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-        _sobel = std::move(k);
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR_VAR("Gradient size %d not supported", gradient_size);
-    }
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_mag);
-    _memory_group.manage(&_phase);
-
-    // Configure gradient
-    _gradient->configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type);
-
-    // Allocate intermediate buffers
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_nonmax);
-
-    // Configure non-maxima suppression
-    _non_max_suppr->configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
-
-    // Allocate intermediate buffers
-    _phase.allocator()->allocate();
-
-    // Fill border around magnitude image as non-maxima suppression will access
-    // it. If border mode is undefined filling the border is a nop.
-    _border_mag_gradient->configure(compile_context, &_mag, _non_max_suppr->border_size(), border_mode, constant_border_value);
-
-    // Allocate intermediate buffers
-    _mag.allocator()->allocate();
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_visited);
-    _memory_group.manage(&_recorded);
-    _memory_group.manage(&_l1_stack);
-    _memory_group.manage(&_l1_list_counter);
-
-    // Configure edge tracing
-    _edge_trace->configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
-
-    // Allocate intermediate buffers
-    _visited.allocator()->allocate();
-    _recorded.allocator()->allocate();
-    _l1_stack.allocator()->allocate();
-    _l1_list_counter.allocator()->allocate();
-    _nonmax.allocator()->allocate();
-}
-
-void CLCannyEdge::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Run sobel
-    _sobel->run();
-
-    // Run phase and magnitude calculation
-    CLScheduler::get().enqueue(*_gradient, false);
-
-    // Fill border before non-maxima suppression. Nop for border mode undefined.
-    CLScheduler::get().enqueue(*_border_mag_gradient, false);
-
-    // Run non max suppresion
-    _nonmax.clear(CLScheduler::get().queue());
-    CLScheduler::get().enqueue(*_non_max_suppr, false);
-
-    // Clear temporary structures and run edge trace
-    _output->clear(CLScheduler::get().queue());
-    _visited.clear(CLScheduler::get().queue());
-    _recorded.clear(CLScheduler::get().queue());
-    _l1_list_counter.clear(CLScheduler::get().queue());
-    _l1_stack.clear(CLScheduler::get().queue());
-    CLScheduler::get().enqueue(*_edge_trace, true);
-}
diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp
deleted file mode 100644
index 543de9c..0000000
--- a/src/runtime/CL/functions/CLChannelCombine.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
-
-#include "src/core/CL/kernels/CLChannelCombineKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output);
-}
-
-void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
-{
-    auto k = std::make_unique<CLChannelCombineKernel>();
-    k->configure(compile_context, plane0, plane1, plane2, plane3, output);
-    _kernel = std::move(k);
-}
-
-void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output);
-}
-
-void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
-{
-    auto k = std::make_unique<CLChannelCombineKernel>();
-    k->configure(compile_context, plane0, plane1, plane2, output);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp
deleted file mode 100644
index 645fc05..0000000
--- a/src/runtime/CL/functions/CLChannelExtract.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
-
-#include "src/core/CL/kernels/CLChannelExtractKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
-}
-
-void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output)
-{
-    auto k = std::make_unique<CLChannelExtractKernel>();
-    k->configure(compile_context, input, channel, output);
-    _kernel = std::move(k);
-}
-
-void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
-}
-
-void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output)
-{
-    auto k = std::make_unique<CLChannelExtractKernel>();
-    k->configure(compile_context, input, channel, output);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp
deleted file mode 100644
index 9aeeb65..0000000
--- a/src/runtime/CL/functions/CLColorConvert.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
-
-#include "src/core/CL/kernels/CLColorConvertKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = std::make_unique<CLColorConvertKernel>();
-    k->configure(compile_context, input, output);
-    _kernel = std::move(k);
-}
-
-void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output)
-{
-    auto k = std::make_unique<CLColorConvertKernel>();
-    k->configure(compile_context, input, output);
-    _kernel = std::move(k);
-}
-
-void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output)
-{
-    auto k = std::make_unique<CLColorConvertKernel>();
-    k->configure(compile_context, input, output);
-    _kernel = std::move(k);
-}
-
-void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output)
-{
-    auto k = std::make_unique<CLColorConvertKernel>();
-    k->configure(compile_context, input, output);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
deleted file mode 100644
index ffc7cda..0000000
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLConvolution.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "src/core/CL/kernels/CLConvolutionKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value);
-}
-
-void CLConvolution3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
-                                 uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLConvolution3x3Kernel>();
-    k->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
-
-template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(std::make_unique<CLSeparableConvolutionHorKernel<matrix_size>>()),
-      _kernel_vert(std::make_unique<CLSeparableConvolutionVertKernel<matrix_size>>()), _kernel(std::make_unique<CLConvolutionKernel<matrix_size>>()), _border_handler(std::make_unique<CLFillBorderKernel>())
-{
-}
-
-template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::~CLConvolutionSquare() = default;
-
-template <unsigned int matrix_size>
-void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
-                                                 uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value);
-}
-
-template <unsigned int matrix_size>
-void CLConvolutionSquare<matrix_size>::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
-                                                 uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(conv == nullptr);
-    std::array<int16_t, matrix_size> conv_col{ 0 };
-    std::array<int16_t, matrix_size> conv_row{ 0 };
-    _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size);
-
-    if(_is_separable)
-    {
-        std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size);
-        _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_tmp);
-
-        if(scale == 0)
-        {
-            scale = calculate_matrix_scale(conv, matrix_size);
-        }
-
-        _kernel_hor->configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
-        _kernel_vert->configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
-        _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value));
-
-        // Allocate intermediate buffer
-        _tmp.allocator()->allocate();
-    }
-    else
-    {
-        _kernel->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
-        _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-    }
-}
-
-template <unsigned int matrix_size>
-void                   CLConvolutionSquare<matrix_size>::run()
-{
-    CLScheduler::get().enqueue(*_border_handler);
-
-    if(_is_separable)
-    {
-        MemoryGroupResourceScope scope_mg(_memory_group);
-
-        CLScheduler::get().enqueue(*_kernel_hor, false);
-        CLScheduler::get().enqueue(*_kernel_vert);
-    }
-    else
-    {
-        CLScheduler::get().enqueue(*_kernel);
-    }
-}
-
-template class arm_compute::CLConvolutionSquare<5>;
-template class arm_compute::CLConvolutionSquare<7>;
-template class arm_compute::CLConvolutionSquare<9>;
-
-void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, rows, cols, scale, border_mode, constant_border_value);
-}
-
-void CLConvolutionRectangle::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale,
-                                       BorderMode border_mode, uint8_t constant_border_value)
-{
-    border_mode = (border_mode == BorderMode::UNDEFINED) ? BorderMode::CONSTANT : border_mode;
-    auto k      = std::make_unique<CLConvolutionRectangleKernel>();
-    k->configure(compile_context, input, output, conv, rows, cols, scale, false);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
deleted file mode 100644
index 2e3ecf7..0000000
--- a/src/runtime/CL/functions/CLDerivative.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDerivative.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLDerivativeKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLDerivative::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLDerivativeKernel>();
-    k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
deleted file mode 100644
index 92c5cc7..0000000
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDilate.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLDilateKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLDilate::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLDilateKernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
deleted file mode 100644
index 11607cf..0000000
--- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
-
-#include "arm_compute/core/CL/ICLDistribution1D.h"
-#include "arm_compute/core/CL/ICLLut.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLHistogramKernel.h"
-#include "src/core/CL/kernels/CLTableLookupKernel.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include <numeric>
-
-using namespace arm_compute;
-
-namespace
-{
-void calculate_cum_dist_and_lut(CLDistribution1D &dist, CLDistribution1D &cum_dist, CLLut &lut)
-{
-    dist.map(true);
-    cum_dist.map(true);
-    lut.map(true);
-
-    const uint32_t *dist_ptr     = dist.buffer();
-    uint32_t       *cum_dist_ptr = cum_dist.buffer();
-    uint8_t        *lut_ptr      = lut.buffer();
-
-    ARM_COMPUTE_ERROR_ON(dist_ptr == nullptr);
-    ARM_COMPUTE_ERROR_ON(cum_dist_ptr == nullptr);
-    ARM_COMPUTE_ERROR_ON(lut_ptr == nullptr);
-
-    // Calculate cumulative distribution
-    std::partial_sum(dist_ptr, dist_ptr + 256, cum_dist_ptr);
-
-    // Get the number of pixels that have the lowest value in the input image
-    const uint32_t num_lowest_pixels = *std::find_if(dist_ptr, dist_ptr + 256, [](const uint32_t &v)
-    {
-        return v > 0;
-    });
-    const size_t image_size = cum_dist_ptr[255];
-
-    if(image_size == num_lowest_pixels)
-    {
-        std::iota(lut_ptr, lut_ptr + 256, 0);
-    }
-    else
-    {
-        const float diff = image_size - num_lowest_pixels;
-
-        for(size_t i = 0; i < 256; ++i)
-        {
-            lut_ptr[i] = lround((cum_dist_ptr[i] - num_lowest_pixels) / diff * 255.f);
-        }
-    }
-
-    dist.unmap();
-    cum_dist.unmap();
-    lut.unmap();
-}
-} // namespace
-
-CLEqualizeHistogram::CLEqualizeHistogram()
-    : _histogram_kernel(std::make_unique<CLHistogramKernel>()),
-      _border_histogram_kernel(std::make_unique<CLHistogramBorderKernel>()),
-      _map_histogram_kernel(std::make_unique<CLTableLookupKernel>()),
-      _hist(nr_bins, 0, max_range),
-      _cum_dist(nr_bins, 0, max_range),
-      _cd_lut(nr_bins, DataType::U8)
-{
-}
-
-CLEqualizeHistogram::~CLEqualizeHistogram() = default;
-
-void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLEqualizeHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output)
-{
-    _histogram_kernel->configure(compile_context, input, &_hist);
-    _border_histogram_kernel->configure(compile_context, input, &_hist);
-    _map_histogram_kernel->configure(compile_context, input, &_cd_lut, output);
-}
-
-void CLEqualizeHistogram::run()
-{
-    // Calculate histogram of input.
-    CLScheduler::get().enqueue(*_histogram_kernel, false);
-
-    // Calculate remaining pixels when image is not multiple of the elements of histogram kernel
-    CLScheduler::get().enqueue(*_border_histogram_kernel, false);
-
-    // Calculate cumulative distribution of histogram and create LUT.
-    calculate_cum_dist_and_lut(_hist, _cum_dist, _cd_lut);
-
-    // Map input to output using created LUT.
-    CLScheduler::get().enqueue(*_map_histogram_kernel);
-}
diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp
deleted file mode 100644
index 29551fc..0000000
--- a/src/runtime/CL/functions/CLErode.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLErode.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLErodeKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLErode::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLErodeKernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
deleted file mode 100644
index a3a62d6..0000000
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLFastCorners.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "src/core/CL/kernels/CLFastCornersKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <algorithm>
-#include <cstring>
-
-using namespace arm_compute;
-
-CLFastCorners::CLFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _fast_corners_kernel(std::make_unique<CLFastCornersKernel>()),
-      _suppr_func(),
-      _copy_array_kernel(std::make_unique<CLCopyToArrayKernel>()),
-      _output(),
-      _suppr(),
-      _win(),
-      _non_max(false),
-      _num_corners(nullptr),
-      _num_buffer(),
-      _corners(nullptr),
-      _constant_border_value(0)
-{
-}
-
-CLFastCorners::~CLFastCorners() = default;
-
-void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
-                              unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, threshold, nonmax_suppression, corners, num_corners, border_mode, constant_border_value);
-}
-
-void CLFastCorners::configure(const CLCompileContext &compile_context, const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
-                              unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode);
-    ARM_COMPUTE_ERROR_ON(nullptr == corners);
-    ARM_COMPUTE_ERROR_ON(threshold < 1 && threshold > 255);
-
-    TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::U8);
-    _output.allocator()->init(tensor_info);
-
-    _non_max               = nonmax_suppression;
-    _num_corners           = num_corners;
-    _corners               = corners;
-    _num_buffer            = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
-    _constant_border_value = constant_border_value;
-
-    const bool update_number = (nullptr != _num_corners);
-
-    _memory_group.manage(&_output);
-    _fast_corners_kernel->configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode);
-
-    if(!_non_max)
-    {
-        _copy_array_kernel->configure(compile_context, &_output, update_number, _corners, &_num_buffer);
-    }
-    else
-    {
-        _suppr.allocator()->init(tensor_info);
-        _memory_group.manage(&_suppr);
-
-        _suppr_func.configure(compile_context, &_output, &_suppr, border_mode);
-        _copy_array_kernel->configure(compile_context, &_suppr, update_number, _corners, &_num_buffer);
-
-        _suppr.allocator()->allocate();
-    }
-
-    // Allocate intermediate tensors
-    _output.allocator()->allocate();
-}
-
-void CLFastCorners::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    if(_non_max)
-    {
-        ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function");
-        const auto out_buffer = static_cast<unsigned char *>(q.enqueueMapBuffer(_output.cl_buffer(), CL_TRUE, CL_MAP_WRITE, 0, _output.info()->total_size()));
-        memset(out_buffer, 0, _output.info()->total_size());
-        q.enqueueUnmapMemObject(_output.cl_buffer(), out_buffer);
-    }
-
-    CLScheduler::get().enqueue(*_fast_corners_kernel, false);
-
-    if(_non_max)
-    {
-        _suppr_func.run();
-    }
-
-    CLScheduler::get().enqueue(*_copy_array_kernel, false);
-
-    unsigned int get_num_corners = 0;
-    q.enqueueReadBuffer(_num_buffer, CL_TRUE, 0, sizeof(unsigned int), &get_num_corners);
-
-    size_t corner_size = std::min(static_cast<size_t>(get_num_corners), _corners->max_num_values());
-
-    _corners->resize(corner_size);
-
-    if(_num_corners != nullptr)
-    {
-        *_num_corners = get_num_corners;
-    }
-
-    q.flush();
-}
diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
deleted file mode 100644
index 8eeade2..0000000
--- a/src/runtime/CL/functions/CLGaussian3x3.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGaussian3x3Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLGaussian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLGaussian3x3Kernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
deleted file mode 100644
index ee72fcb..0000000
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-CLGaussian5x5::CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _kernel_hor(std::make_unique<CLGaussian5x5HorKernel>()),
-      _kernel_vert(std::make_unique<CLGaussian5x5VertKernel>()),
-      _border_handler(std::make_unique<CLFillBorderKernel>()),
-      _tmp()
-{
-}
-
-CLGaussian5x5::~CLGaussian5x5() = default;
-
-void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16));
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_tmp);
-
-    // Configure kernels
-    _kernel_hor->configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED);
-    _kernel_vert->configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED);
-    _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value));
-
-    // Allocate intermediate buffers
-    _tmp.allocator()->allocate();
-}
-
-void CLGaussian5x5::run()
-{
-    CLScheduler::get().enqueue(*_border_handler, false);
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    CLScheduler::get().enqueue(*_kernel_hor, false);
-    CLScheduler::get().enqueue(*_kernel_vert);
-}
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
deleted file mode 100644
index 9fe35f6..0000000
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
-#include "src/core/CL/kernels/CLGaussianPyramidKernel.h"
-#include "src/core/CL/kernels/CLScaleKernel.h"
-
-#include <cstddef>
-
-using namespace arm_compute;
-
-CLGaussianPyramid::CLGaussianPyramid()
-    : _input(nullptr), _pyramid(nullptr), _tmp()
-{
-}
-
-CLGaussianPyramid::~CLGaussianPyramid() = default;
-
-CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
-    : _horizontal_border_handler(),
-      _vertical_border_handler(),
-      _horizontal_reduction(),
-      _vertical_reduction()
-{
-}
-
-CLGaussianPyramidHalf::~CLGaussianPyramidHalf() = default;
-
-void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value);
-}
-
-void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(pyramid == nullptr);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
-    ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
-
-    // Constant value to use for vertical fill border when the border mode is CONSTANT
-    const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6;
-
-    /* Get number of pyramid levels */
-    const size_t num_levels = pyramid->info()->num_levels();
-
-    _input   = input;
-    _pyramid = pyramid;
-
-    if(num_levels > 1)
-    {
-        _horizontal_border_handler.reserve(num_levels - 1);
-        _vertical_border_handler.reserve(num_levels - 1);
-        _horizontal_reduction.reserve(num_levels - 1);
-        _vertical_reduction.reserve(num_levels - 1);
-
-        // Apply half scale to the X dimension of the tensor shape
-        TensorShape tensor_shape = pyramid->info()->tensor_shape();
-        tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
-
-        PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16);
-        _tmp.init(pyramid_info);
-
-        for(size_t i = 0; i < num_levels - 1; ++i)
-        {
-            /* Configure horizontal kernel */
-            _horizontal_reduction.emplace_back(std::make_unique<CLGaussianPyramidHorKernel>());
-            _horizontal_reduction.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
-
-            /* Configure vertical kernel */
-            _vertical_reduction.emplace_back(std::make_unique<CLGaussianPyramidVertKernel>());
-            _vertical_reduction.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
-
-            /* Configure border */
-            _horizontal_border_handler.emplace_back(std::make_unique<CLFillBorderKernel>());
-            _horizontal_border_handler.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction.back()->border_size(), border_mode, PixelValue(constant_border_value));
-
-            /* Configure border */
-            _vertical_border_handler.emplace_back(std::make_unique<CLFillBorderKernel>());
-            _vertical_border_handler.back()->configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction.back()->border_size(), border_mode, PixelValue(pixel_value_u16));
-        }
-        _tmp.allocate();
-    }
-}
-
-void CLGaussianPyramidHalf::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
-
-    /* Get number of pyramid levels */
-    const size_t num_levels = _pyramid->info()->num_levels();
-
-    /* The first level of the pyramid has the input image */
-    _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
-    _input->map(CLScheduler::get().queue(), true /* blocking */);
-    _pyramid->get_pyramid_level(0)->copy_from(*_input);
-
-    _input->unmap(CLScheduler::get().queue());
-    _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
-
-    for(unsigned int i = 0; i < num_levels - 1; ++i)
-    {
-        CLScheduler::get().enqueue(*_horizontal_border_handler[i], false);
-        CLScheduler::get().enqueue(*_horizontal_reduction[i], false);
-        CLScheduler::get().enqueue(*_vertical_border_handler[i], false);
-        CLScheduler::get().enqueue(*_vertical_reduction[i], false);
-    }
-}
-
-CLGaussianPyramidOrb::CLGaussianPyramidOrb() // NOLINT
-    : _gauss5x5(),
-      _scale_nearest()
-{
-}
-
-void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value);
-}
-
-void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
-    ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale());
-
-    /* Get number of pyramid levels */
-    const size_t num_levels = pyramid->info()->num_levels();
-
-    _input   = input;
-    _pyramid = pyramid;
-
-    if(num_levels > 1)
-    {
-        _gauss5x5.resize(num_levels - 1);
-        _scale_nearest.reserve(num_levels - 1);
-
-        PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
-
-        _tmp.init(pyramid_info);
-
-        for(size_t i = 0; i < num_levels - 1; ++i)
-        {
-            /* Configure gaussian 5x5 */
-            _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
-
-            /* Configure scale image kernel */
-            _scale_nearest.emplace_back(std::make_unique<CLScaleKernel>());
-            _scale_nearest.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER });
-        }
-
-        _tmp.allocate();
-    }
-}
-
-void CLGaussianPyramidOrb::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
-
-    /* Get number of pyramid levels */
-    const size_t num_levels = _pyramid->info()->num_levels();
-
-    /* The first level of the pyramid has the input image */
-    _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
-    _input->map(CLScheduler::get().queue(), true /* blocking */);
-    _pyramid->get_pyramid_level(0)->copy_from(*_input);
-    _input->unmap(CLScheduler::get().queue());
-    _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
-
-    for(unsigned int i = 0; i < num_levels - 1; ++i)
-    {
-        _gauss5x5[i].run();
-        CLScheduler::get().enqueue(*_scale_nearest[i]);
-    }
-}
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
deleted file mode 100644
index 8d9ea17..0000000
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/HOGInfo.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-using namespace arm_compute;
-
-CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _gradient(),
-      _orient_bin(std::make_unique<CLHOGOrientationBinningKernel>()),
-      _block_norm(std::make_unique<CLHOGBlockNormalizationKernel>()),
-      _mag(),
-      _phase(),
-      _hog_space()
-{
-}
-
-CLHOGDescriptor::~CLHOGDescriptor() = default;
-
-void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, hog, border_mode, constant_border_value);
-}
-
-void CLHOGDescriptor::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    ARM_COMPUTE_ERROR_ON(nullptr == hog);
-
-    const HOGInfo *hog_info = hog->info();
-    const size_t   width    = input->info()->dimension(Window::DimX);
-    const size_t   height   = input->info()->dimension(Window::DimY);
-    const size_t   num_bins = hog_info->num_bins();
-
-    Size2D cell_size = hog_info->cell_size();
-
-    // Calculate number of cells along the x and y directions for the hog_space
-    const size_t num_cells_x = width / cell_size.width;
-    const size_t num_cells_y = height / cell_size.height;
-
-    // TensorShape of the input image
-    const TensorShape &shape_img = input->info()->tensor_shape();
-
-    // TensorShape of the hog space
-    TensorShape shape_hog_space = input->info()->tensor_shape();
-    shape_hog_space.set(Window::DimX, num_cells_x);
-    shape_hog_space.set(Window::DimY, num_cells_y);
-
-    // Intitialize tensors for magnitude, phase and hog space
-    TensorInfo info_mag(shape_img, Format::S16);
-    _mag.allocator()->init(info_mag);
-
-    TensorInfo info_phase(shape_img, Format::U8);
-    _phase.allocator()->init(info_phase);
-
-    TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
-    _hog_space.allocator()->init(info_space);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_mag);
-    _memory_group.manage(&_phase);
-
-    // Initialise gradient kernel
-    _gradient.configure(compile_context, input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_hog_space);
-
-    // Initialise orientation binning kernel
-    _orient_bin->configure(compile_context, &_mag, &_phase, &_hog_space, hog->info());
-
-    // Initialize HOG norm kernel
-    _block_norm->configure(compile_context, &_hog_space, output, hog->info());
-
-    // Allocate intermediate tensors
-    _mag.allocator()->allocate();
-    _phase.allocator()->allocate();
-    _hog_space.allocator()->allocate();
-}
-
-void CLHOGDescriptor::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Run gradient
-    _gradient.run();
-
-    // Run orientation binning
-    CLScheduler::get().enqueue(*_orient_bin, false);
-
-    // Run block normalization
-    CLScheduler::get().enqueue(*_block_norm);
-}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp
deleted file mode 100644
index 365021c..0000000
--- a/src/runtime/CL/functions/CLHOGDetector.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
-
-#include <algorithm>
-
-using namespace arm_compute;
-
-CLHOGDetector::CLHOGDetector()
-    : _hog_detector_kernel(std::make_unique<CLHOGDetectorKernel>()), _detection_windows(nullptr), _num_detection_windows()
-{
-}
-
-CLHOGDetector::~CLHOGDetector() = default;
-
-void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, detection_window_stride, threshold, idx_class);
-}
-
-void CLHOGDetector::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride,
-                              float threshold, size_t idx_class)
-{
-    _detection_windows = detection_windows;
-
-    // Allocate buffer for storing the number of detected objects
-    _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
-
-    // Configure HOGDetectorKernel
-    _hog_detector_kernel->configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
-}
-
-void CLHOGDetector::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    // Reset number of detections
-    const unsigned int init_num_detection_windows = _detection_windows->num_values();
-    q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows);
-
-    // Run CLHOGDetectorKernel
-    CLScheduler::get().enqueue(*_hog_detector_kernel);
-
-    // Read number of detections
-    unsigned int num_detection_windows = 0;
-    q.enqueueReadBuffer(_num_detection_windows, CL_TRUE, 0, sizeof(unsigned int), &num_detection_windows);
-
-    // Update the number of values stored in _detection_windows
-    _detection_windows->resize(static_cast<size_t>(num_detection_windows));
-
-    q.flush();
-}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
deleted file mode 100644
index f3aa527..0000000
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-using namespace arm_compute;
-
-CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _derivative(),
-      _mag_phase(std::make_unique<CLMagnitudePhaseKernel>()),
-      _gx(),
-      _gy()
-{
-}
-
-void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_magnitude, output_phase, phase_type, border_mode, constant_border_value);
-}
-
-void CLHOGGradient::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode,
-                              uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8);
-
-    const TensorShape &shape_img = input->info()->tensor_shape();
-
-    // Allocate image memory
-    TensorInfo info(shape_img, Format::S16);
-    _gx.allocator()->init(info);
-    _gy.allocator()->init(info);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_gx);
-    _memory_group.manage(&_gy);
-
-    // Initialise derivate kernel
-    _derivative.configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-
-    // Initialise magnitude/phase kernel
-    if(PhaseType::UNSIGNED == phase_type)
-    {
-        _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
-    }
-    else
-    {
-        _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED);
-    }
-
-    // Allocate intermediate tensors
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-}
-
-void CLHOGGradient::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Run derivative
-    _derivative.run();
-
-    // Run magnitude/phase kernel
-    CLScheduler::get().enqueue(*_mag_phase);
-}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
deleted file mode 100644
index 2464e6c..0000000
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/Scheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
-#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-using namespace arm_compute;
-
-CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _gradient_kernel(),
-      _orient_bin_kernel(),
-      _block_norm_kernel(),
-      _hog_detect_kernel(),
-      _non_maxima_kernel(),
-      _hog_space(),
-      _hog_norm_space(),
-      _detection_windows(),
-      _mag(),
-      _phase(),
-      _non_maxima_suppression(false),
-      _num_orient_bin_kernel(0),
-      _num_block_norm_kernel(0),
-      _num_hog_detect_kernel(0)
-{
-}
-
-CLHOGMultiDetection::~CLHOGMultiDetection() = default;
-
-void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
-                                    uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, multi_hog, detection_windows, detection_window_strides, border_mode, constant_border_value, threshold, non_maxima_suppression,
-              min_distance);
-}
-
-void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows,
-                                    ICLSize2DArray *detection_window_strides, BorderMode border_mode,
-                                    uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
-    ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
-    ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models());
-
-    const size_t       width      = input->info()->dimension(Window::DimX);
-    const size_t       height     = input->info()->dimension(Window::DimY);
-    const TensorShape &shape_img  = input->info()->tensor_shape();
-    const size_t       num_models = multi_hog->num_models();
-    PhaseType          phase_type = multi_hog->model(0)->info()->phase_type();
-
-    size_t prev_num_bins     = multi_hog->model(0)->info()->num_bins();
-    Size2D prev_cell_size    = multi_hog->model(0)->info()->cell_size();
-    Size2D prev_block_size   = multi_hog->model(0)->info()->block_size();
-    Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride();
-
-    /* Check if CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object
-     *
-     * 1) CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change.
-     *        Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th
-     * 2) CLHOGBlockNormalizationKernel is skipped if the cell size, the number of bins and block size do not change.
-     *         Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th
-     *
-     * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel
-     *       with "input_orient_bin", "input_hog_detect" and "input_block_norm"
-     */
-    std::vector<size_t> input_orient_bin;
-    std::vector<size_t> input_hog_detect;
-    std::vector<std::pair<size_t, size_t>> input_block_norm;
-
-    input_orient_bin.push_back(0);
-    input_hog_detect.push_back(0);
-    input_block_norm.emplace_back(0, 0);
-
-    for(size_t i = 1; i < num_models; ++i)
-    {
-        size_t cur_num_bins     = multi_hog->model(i)->info()->num_bins();
-        Size2D cur_cell_size    = multi_hog->model(i)->info()->cell_size();
-        Size2D cur_block_size   = multi_hog->model(i)->info()->block_size();
-        Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride();
-
-        if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
-        {
-            prev_num_bins     = cur_num_bins;
-            prev_cell_size    = cur_cell_size;
-            prev_block_size   = cur_block_size;
-            prev_block_stride = cur_block_stride;
-
-            // Compute orientation binning and block normalization kernels. Update input to process
-            input_orient_bin.push_back(i);
-            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
-        }
-        else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
-                || (cur_block_stride.height != prev_block_stride.height))
-        {
-            prev_block_size   = cur_block_size;
-            prev_block_stride = cur_block_stride;
-
-            // Compute block normalization kernel. Update input to process
-            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
-        }
-
-        // Update input to process for hog detector kernel
-        input_hog_detect.push_back(input_block_norm.size() - 1);
-    }
-
-    _detection_windows      = detection_windows;
-    _non_maxima_suppression = non_maxima_suppression;
-    _num_orient_bin_kernel  = input_orient_bin.size(); // Number of CLHOGOrientationBinningKernel kernels to compute
-    _num_block_norm_kernel  = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
-    _num_hog_detect_kernel  = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
-
-    _orient_bin_kernel.reserve(_num_orient_bin_kernel);
-    _block_norm_kernel.reserve(_num_block_norm_kernel);
-    _hog_detect_kernel.resize(_num_hog_detect_kernel);
-    _hog_space.resize(_num_orient_bin_kernel);
-    _hog_norm_space.resize(_num_block_norm_kernel);
-
-    // Allocate tensors for magnitude and phase
-    TensorInfo info_mag(shape_img, Format::S16);
-    _mag.allocator()->init(info_mag);
-
-    TensorInfo info_phase(shape_img, Format::U8);
-    _phase.allocator()->init(info_phase);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_mag);
-    _memory_group.manage(&_phase);
-
-    // Initialise gradient kernel
-    _gradient_kernel.configure(compile_context, input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
-
-    // Configure NETensor for the HOG space and orientation binning kernel
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        const size_t idx_multi_hog = input_orient_bin[i];
-
-        // Get the corresponding cell size and number of bins
-        const Size2D &cell     = multi_hog->model(idx_multi_hog)->info()->cell_size();
-        const size_t  num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins();
-
-        // Calculate number of cells along the x and y directions for the hog_space
-        const size_t num_cells_x = width / cell.width;
-        const size_t num_cells_y = height / cell.height;
-
-        // TensorShape of hog space
-        TensorShape shape_hog_space = input->info()->tensor_shape();
-        shape_hog_space.set(Window::DimX, num_cells_x);
-        shape_hog_space.set(Window::DimY, num_cells_y);
-
-        // Allocate HOG space
-        TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
-        _hog_space[i].allocator()->init(info_space);
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_hog_space[i]);
-
-        // Initialise orientation binning kernel
-        _orient_bin_kernel.emplace_back(std::make_unique<CLHOGOrientationBinningKernel>());
-        _orient_bin_kernel.back()->configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
-    }
-
-    // Allocate intermediate tensors
-    _mag.allocator()->allocate();
-    _phase.allocator()->allocate();
-
-    // Configure CLTensor for the normalized HOG space and block normalization kernel
-    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
-    {
-        const size_t idx_multi_hog  = input_block_norm[i].first;
-        const size_t idx_orient_bin = input_block_norm[i].second;
-
-        // Allocate normalized HOG space
-        TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
-        _hog_norm_space[i].allocator()->init(tensor_info);
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_hog_norm_space[i]);
-
-        // Initialize block normalization kernel
-        _block_norm_kernel.emplace_back(std::make_unique<CLHOGBlockNormalizationKernel>());
-        _block_norm_kernel.back()->configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
-    }
-
-    // Allocate intermediate tensors
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        _hog_space[i].allocator()->allocate();
-    }
-
-    detection_window_strides->map(CLScheduler::get().queue(), true);
-
-    // Configure HOG detector kernel
-    for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
-    {
-        const size_t idx_block_norm = input_hog_detect[i];
-
-        _hog_detect_kernel[i].configure(compile_context, &_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
-    }
-
-    detection_window_strides->unmap(CLScheduler::get().queue());
-
-    // Configure non maxima suppression kernel
-    _non_maxima_kernel.configure(_detection_windows, min_distance);
-
-    // Allocate intermediate tensors
-    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
-    {
-        _hog_norm_space[i].allocator()->allocate();
-    }
-}
-
-void CLHOGMultiDetection::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Reset detection window
-    _detection_windows->clear();
-
-    // Run gradient
-    _gradient_kernel.run();
-
-    // Run orientation binning kernel
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        CLScheduler::get().enqueue(*_orient_bin_kernel[i], false);
-    }
-
-    // Run block normalization kernel
-    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
-    {
-        CLScheduler::get().enqueue(*_block_norm_kernel[i], false);
-    }
-
-    // Run HOG detector kernel
-    for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
-    {
-        _hog_detect_kernel[i].run();
-    }
-
-    // Run non-maxima suppression kernel if enabled
-    if(_non_maxima_suppression)
-    {
-        // Map detection windows array before computing non maxima suppression
-        _detection_windows->map(CLScheduler::get().queue(), true);
-        Scheduler::get().schedule(&_non_maxima_kernel, Window::DimY);
-        _detection_windows->unmap(CLScheduler::get().queue());
-    }
-}
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
deleted file mode 100644
index 37f428c..0000000
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
-#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
-#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "arm_compute/runtime/Scheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLHarrisCornersKernel.h"
-#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
-#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
-
-#include <cmath>
-#include <utility>
-
-using namespace arm_compute;
-
-CLHarrisCorners::CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _sobel(nullptr),
-      _harris_score(std::make_unique<CLHarrisScoreKernel>()),
-      _non_max_suppr(),
-      _candidates(),
-      _sort_euclidean(),
-      _border_gx(std::make_unique<CLFillBorderKernel>()),
-      _border_gy(std::make_unique<CLFillBorderKernel>()),
-      _gx(),
-      _gy(),
-      _score(),
-      _nonmax(),
-      _corners_list(),
-      _num_corner_candidates(0),
-      _corners(nullptr)
-{
-}
-
-CLHarrisCorners::~CLHarrisCorners() = default;
-
-void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist,
-                                float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
-                                BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, threshold, min_dist, sensitivity, gradient_size, block_size, corners, border_mode, constant_border_value, use_fp16);
-}
-
-void CLHarrisCorners::configure(const CLCompileContext &compile_context, ICLImage *input, float threshold, float min_dist,
-                                float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
-                                BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
-{
-    ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-772): Add half float support
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
-    ARM_COMPUTE_ERROR_ON(nullptr == corners);
-
-    _corners = corners;
-
-    const TensorShape shape = input->info()->tensor_shape();
-    const DataType    dt    = (gradient_size < 7) ? DataType::S16 : DataType::S32;
-    TensorInfo        tensor_info(shape, 1, dt);
-
-    _gx.allocator()->init(tensor_info);
-    _gy.allocator()->init(tensor_info);
-
-    TensorInfo info_f32(shape, 1, DataType::F32);
-    _score.allocator()->init(info_f32);
-    _nonmax.allocator()->init(info_f32);
-
-    _corners_list.resize(shape.x() * shape.y());
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_gx);
-    _memory_group.manage(&_gy);
-
-    /* Set/init Sobel kernel accordingly with gradient_size */
-    switch(gradient_size)
-    {
-        case 3:
-        {
-            auto k = std::make_unique<CLSobel3x3>();
-            k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-            _sobel = std::move(k);
-            break;
-        }
-        case 5:
-        {
-            auto k = std::make_unique<CLSobel5x5>();
-            k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-            _sobel = std::move(k);
-            break;
-        }
-        case 7:
-        {
-            auto k = std::make_unique<CLSobel7x7>();
-            k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
-            _sobel = std::move(k);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Gradient size not implemented");
-    }
-
-    // Normalization factor
-    const float norm_factor               = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
-    const float pow4_normalization_factor = pow(norm_factor, 4);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_score);
-
-    // Set/init Harris Score kernel accordingly with block_size
-    _harris_score->configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-
-    // Configure border filling using harris score kernel's block size
-    _border_gx->configure(compile_context, &_gx, _harris_score->border_size(), border_mode, PixelValue(constant_border_value));
-    _border_gy->configure(compile_context, &_gy, _harris_score->border_size(), border_mode, PixelValue(constant_border_value));
-
-    // Allocate intermediate buffers
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_nonmax);
-
-    // Init non-maxima suppression function
-    _non_max_suppr.configure(compile_context, &_score, &_nonmax, border_mode);
-
-    // Allocate intermediate buffers
-    _score.allocator()->allocate();
-
-    // Init corner candidates kernel
-    _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
-
-    // Allocate intermediate buffers
-    _nonmax.allocator()->allocate();
-
-    // Init euclidean distance
-    _sort_euclidean.configure(_corners_list.data(), _corners, &_num_corner_candidates, min_dist);
-}
-
-void CLHarrisCorners::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Init to 0 number of corner candidates
-    _num_corner_candidates = 0;
-
-    // Run Sobel kernel
-    _sobel->run();
-
-    // Fill border before harris score kernel
-    CLScheduler::get().enqueue(*_border_gx, false);
-    CLScheduler::get().enqueue(*_border_gy, false);
-
-    // Run harris score kernel
-    CLScheduler::get().enqueue(*_harris_score, false);
-
-    // Run non-maxima suppression
-    _non_max_suppr.run();
-
-    // Run corner candidate kernel
-    _nonmax.map(true);
-    Scheduler::get().schedule(&_candidates, Window::DimY);
-    _nonmax.unmap();
-
-    _corners->map(CLScheduler::get().queue(), true);
-    Scheduler::get().schedule(&_sort_euclidean, Window::DimY);
-    _corners->unmap(CLScheduler::get().queue());
-}
diff --git a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp
deleted file mode 100644
index f278cf0..0000000
--- a/src/runtime/CL/functions/CLHistogram.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLHistogram.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLHistogram::CLHistogram()
-    : _kernel(), _kernel_border()
-{
-}
-
-void CLHistogram::configure(const ICLImage *input, ICLDistribution1D *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output)
-{
-    _kernel.configure(compile_context, input, output);
-    _kernel_border.configure(compile_context, input, output);
-}
-
-void CLHistogram::run()
-{
-    CLScheduler::get().enqueue(_kernel, false);
-    CLScheduler::get().enqueue(_kernel_border);
-}
diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp
deleted file mode 100644
index 56a151a..0000000
--- a/src/runtime/CL/functions/CLIntegralImage.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLIntegralImageKernel.h"
-
-using namespace arm_compute;
-
-CLIntegralImage::CLIntegralImage()
-    : _integral_hor(std::make_unique<CLIntegralImageHorKernel>()),
-      _integral_vert(std::make_unique<CLIntegralImageVertKernel>())
-{
-}
-
-CLIntegralImage::~CLIntegralImage() = default;
-
-void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLIntegralImage::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    _integral_hor->configure(compile_context, input, output);
-    _integral_vert->configure(compile_context, output);
-}
-
-void CLIntegralImage::run()
-{
-    CLScheduler::get().enqueue(*_integral_hor, false);
-    CLScheduler::get().enqueue(*_integral_vert);
-}
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
deleted file mode 100644
index 1ad19e5..0000000
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
-#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
-#include "src/core/CL/kernels/CLGaussianPyramidKernel.h"
-
-using namespace arm_compute;
-
-CLLaplacianPyramid::CLLaplacianPyramid() // NOLINT
-    : _num_levels(0),
-      _gaussian_pyr_function(),
-      _convf(),
-      _subf(),
-      _depth_function(),
-      _gauss_pyr(),
-      _conv_pyr()
-{
-}
-
-void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, output, border_mode, constant_border_value);
-}
-
-void CLLaplacianPyramid::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
-
-    _num_levels = pyramid->info()->num_levels();
-
-    // Create and initialize the gaussian pyramid and the convoluted pyramid
-    PyramidInfo pyramid_info;
-    pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8);
-
-    _gauss_pyr.init(pyramid_info);
-    _conv_pyr.init(pyramid_info);
-
-    // Create Gaussian Pyramid function
-    _gaussian_pyr_function.configure(compile_context, input, &_gauss_pyr, border_mode, constant_border_value);
-
-    _convf.resize(_num_levels);
-    _subf.resize(_num_levels);
-
-    for(unsigned int i = 0; i < _num_levels; ++i)
-    {
-        _convf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value);
-        _subf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP);
-    }
-
-    _depth_function.configure(compile_context, _conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0);
-
-    _gauss_pyr.allocate();
-    _conv_pyr.allocate();
-}
-
-void CLLaplacianPyramid::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function");
-
-    _gaussian_pyr_function.run(); // compute gaussian pyramid
-
-    for(unsigned int i = 0; i < _num_levels; ++i)
-    {
-        _convf[i].run(); // convolute gaussian pyramid
-    }
-
-    for(unsigned int i = 0; i < _num_levels; ++i)
-    {
-        _subf[i].run(); // compute laplacian image
-    }
-
-    _depth_function.run();
-}
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
deleted file mode 100644
index d7fd817..0000000
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <cstddef>
-
-using namespace arm_compute;
-
-CLLaplacianReconstruct::CLLaplacianReconstruct() // NOLINT
-    : _tmp_pyr(),
-      _addf(),
-      _scalef(),
-      _depthf()
-{
-}
-
-void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), pyramid, input, output, border_mode, constant_border_value);
-}
-
-void CLLaplacianReconstruct::configure(const CLCompileContext &compile_context, const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
-    ARM_COMPUTE_ERROR_ON(input == output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
-    ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
-
-    const size_t num_levels = pyramid->info()->num_levels();
-
-    // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) )
-    PyramidInfo pyramid_info;
-    pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16);
-    _tmp_pyr.init(pyramid_info);
-
-    // Allocate add and scale functions. Level 0 does not need to be scaled.
-    _addf.resize(num_levels);
-    _scalef.resize(num_levels - 1);
-
-    const size_t last_level = num_levels - 1;
-
-    _addf[last_level].configure(compile_context, input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE);
-
-    // Scale levels n-1 to 1, and add levels n-2 to 0
-    for(size_t l = 0; l < last_level; ++l)
-    {
-        _scalef[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), ScaleKernelInfo{ arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value });
-        _addf[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE);
-    }
-
-    // Convert level 0 from S16 to U8
-    _depthf.configure(compile_context, _tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0);
-
-    _tmp_pyr.allocate();
-}
-
-void CLLaplacianReconstruct::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function");
-
-    const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
-
-    _addf[last_level].run();
-
-    // Run l = [last_level - 1, 0]
-    for(size_t l = last_level; l-- > 0;)
-    {
-        _scalef[l].run();
-        _addf[l].run();
-    }
-
-    _depthf.run();
-}
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
deleted file mode 100644
index 0599a11..0000000
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
-
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, mag_type);
-}
-
-void CLMagnitude::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
-{
-    auto k = std::make_unique<CLMagnitudePhaseKernel>();
-    k->configure(compile_context, input1, input2, output, nullptr, mag_type);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
deleted file mode 100644
index d8cd41d..0000000
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/TensorInfo.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLMeanStdDevKernel.h"
-#include "src/core/CL/kernels/CLReductionOperationKernel.h"
-
-using namespace arm_compute;
-
-CLMeanStdDev::CLMeanStdDev(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _data_type(),
-      _num_pixels(),
-      _run_stddev(),
-      _reduction_operation_mean(),
-      _reduction_operation_stddev(),
-      _reduction_output_mean(),
-      _reduction_output_stddev(),
-      _mean(nullptr),
-      _stddev(nullptr),
-      _mean_stddev_kernel(std::make_unique<CLMeanStdDevKernel>()),
-      _fill_border_kernel(std::make_unique<CLFillBorderKernel>()),
-      _global_sum(),
-      _global_sum_squared()
-{
-}
-
-CLMeanStdDev::~CLMeanStdDev() = default;
-
-Status CLMeanStdDev::validate(ITensorInfo *input, float *mean, float *stddev)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input);
-    if(is_data_type_float(input->data_type()))
-    {
-        ARM_COMPUTE_UNUSED(mean);
-        ARM_COMPUTE_UNUSED(stddev);
-
-        TensorShape output_shape      = TensorShape{ 1, input->dimension(1) };
-        TensorInfo  output_shape_info = TensorInfo(output_shape, 1, DataType::U8);
-        return CLReductionOperation::validate(input, &output_shape_info, 0, ReductionOperation::SUM);
-    }
-    else
-    {
-        return CLMeanStdDevKernel::validate(input, mean, nullptr, stddev, nullptr);
-    }
-}
-
-void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, mean, stddev);
-}
-
-void CLMeanStdDev::configure(const CLCompileContext &compile_context, ICLImage *input, float *mean, float *stddev)
-{
-    // In the case of F16/F32 we call reduction operation for calculating CLMeanStdDev
-    _data_type = input->info()->data_type();
-
-    if(is_data_type_float(_data_type))
-    {
-        _num_pixels = input->info()->dimension(0) * input->info()->dimension(1);
-
-        _memory_group.manage(&_reduction_output_mean);
-        _reduction_operation_mean.configure(compile_context, input, &_reduction_output_mean, 0, ReductionOperation::SUM);
-        _reduction_output_mean.allocator()->allocate();
-        _mean = mean;
-
-        if(stddev != nullptr)
-        {
-            _memory_group.manage(&_reduction_output_stddev);
-            _reduction_operation_stddev.configure(compile_context, input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE);
-            _reduction_output_stddev.allocator()->allocate();
-            _stddev     = stddev;
-            _run_stddev = true;
-        }
-    }
-    else
-    {
-        _global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
-
-        if(stddev != nullptr)
-        {
-            _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
-        }
-
-        _mean_stddev_kernel->configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared);
-        _fill_border_kernel->configure(compile_context, input, _mean_stddev_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
-    }
-}
-
-template <typename T>
-void CLMeanStdDev::run_float()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Perform reduction on x-axis
-    _reduction_operation_mean.run();
-    if(_run_stddev)
-    {
-        _reduction_operation_stddev.run();
-        _reduction_output_stddev.map(true);
-    }
-
-    _reduction_output_mean.map(true);
-
-    auto mean = static_cast<T>(0);
-
-    // Calculate final result for mean
-    for(unsigned int i = 0; i < _reduction_output_mean.info()->dimension(1); ++i)
-    {
-        mean += *reinterpret_cast<T *>(_reduction_output_mean.buffer() + _reduction_output_mean.info()->offset_element_in_bytes(Coordinates(0, i)));
-    }
-
-    mean /= _num_pixels;
-    *_mean = mean;
-
-    if(_run_stddev)
-    {
-        auto stddev = static_cast<T>(0);
-        // Calculate final result for stddev
-        for(unsigned int i = 0; i < _reduction_output_stddev.info()->dimension(1); ++i)
-        {
-            stddev += *reinterpret_cast<T *>(_reduction_output_stddev.buffer() + _reduction_output_stddev.info()->offset_element_in_bytes(Coordinates(0, i)));
-        }
-        *_stddev = std::sqrt((stddev / _num_pixels) - (mean * mean));
-
-        _reduction_output_stddev.unmap();
-    }
-    _reduction_output_mean.unmap();
-}
-
-void CLMeanStdDev::run_int()
-{
-    CLScheduler::get().enqueue(*_fill_border_kernel);
-    CLScheduler::get().enqueue(*_mean_stddev_kernel);
-}
-
-void CLMeanStdDev::run()
-{
-    switch(_data_type)
-    {
-        case DataType::F16:
-            run_float<half>();
-            break;
-        case DataType::F32:
-            run_float<float>();
-            break;
-        case DataType::U8:
-            run_int();
-            break;
-        default:
-            ARM_COMPUTE_ERROR_ON("Not supported");
-    }
-}
diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
deleted file mode 100644
index b32063a..0000000
--- a/src/runtime/CL/functions/CLMedian3x3.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLMedian3x3Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLMedian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLMedian3x3Kernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
deleted file mode 100644
index ace6a1c..0000000
--- a/src/runtime/CL/functions/CLMinMaxLocation.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "src/core/CL/kernels/CLMinMaxLocationKernel.h"
-
-namespace arm_compute
-{
-CLMinMaxLocation::CLMinMaxLocation()
-    : _min_max_kernel(std::make_unique<CLMinMaxKernel>()),
-      _min_max_loc_kernel(std::make_unique<CLMinMaxLocationKernel>()),
-      _min_max_vals(),
-      _min_max_count_vals(),
-      _min(nullptr),
-      _max(nullptr),
-      _min_count(nullptr),
-      _max_count(nullptr),
-      _min_loc(nullptr),
-      _max_loc(nullptr)
-{
-}
-
-CLMinMaxLocation::~CLMinMaxLocation() = default;
-
-void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, min, max, min_loc, max_loc, min_count, max_count);
-}
-
-void CLMinMaxLocation::configure(const CLCompileContext &compile_context, const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc,
-                                 uint32_t *min_count,
-                                 uint32_t *max_count)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == min);
-    ARM_COMPUTE_ERROR_ON(nullptr == max);
-
-    _min_max_vals       = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(int32_t));
-    _min_max_count_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(uint32_t));
-    _min                = min;
-    _max                = max;
-    _min_count          = min_count;
-    _max_count          = max_count;
-    _min_loc            = min_loc;
-    _max_loc            = max_loc;
-
-    _min_max_kernel->configure(compile_context, input, &_min_max_vals);
-    _min_max_loc_kernel->configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc);
-}
-
-void CLMinMaxLocation::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    CLScheduler::get().enqueue(*_min_max_kernel, false);
-    CLScheduler::get().enqueue(*_min_max_loc_kernel, false);
-
-    // Update min and max
-    q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_min));
-    q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_max));
-
-    // Update min and max count
-    if(_min_count != nullptr)
-    {
-        q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 0 * sizeof(uint32_t), sizeof(uint32_t), _min_count);
-    }
-    if(_max_count != nullptr)
-    {
-        q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 1 * sizeof(uint32_t), sizeof(uint32_t), _max_count);
-    }
-
-    // Update min/max point arrays (Makes the kernel blocking)
-    if(_min_loc != nullptr)
-    {
-        unsigned int min_count = 0;
-        q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 0 * sizeof(uint32_t), sizeof(uint32_t), &min_count);
-        size_t min_corner_size = std::min(static_cast<size_t>(min_count), _min_loc->max_num_values());
-        _min_loc->resize(min_corner_size);
-    }
-    if(_max_loc != nullptr)
-    {
-        unsigned int max_count = 0;
-        q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 1 * sizeof(uint32_t), sizeof(uint32_t), &max_count);
-        size_t max_corner_size = std::min(static_cast<size_t>(max_count), _max_loc->max_num_values());
-        _max_loc->resize(max_corner_size);
-    }
-}
-} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
deleted file mode 100644
index ec88f87..0000000
--- a/src/runtime/CL/functions/CLNonLinearFilter.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
-
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLNonLinearFilterKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                                  BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_mode, constant_border_value);
-}
-
-void CLNonLinearFilter::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern,
-                                  const uint8_t *mask, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLNonLinearFilterKernel>();
-    k->configure(compile_context, input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
deleted file mode 100644
index 5906ea5..0000000
--- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
-
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode);
-}
-
-void CLNonMaximaSuppression3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode)
-{
-    auto k = std::make_unique<CLNonMaximaSuppression3x3Kernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-
-    if(border_mode != BorderMode::UNDEFINED)
-    {
-        _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT);
-    }
-    else
-    {
-        _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED);
-    }
-}
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
deleted file mode 100644
index 76e0ac5..0000000
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLLKTrackerKernel.h"
-
-using namespace arm_compute;
-
-CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _tracker_init_kernel(),
-      _tracker_stage0_kernel(),
-      _tracker_stage1_kernel(),
-      _tracker_finalize_kernel(std::make_unique<CLLKTrackerFinalizeKernel>()),
-      _func_scharr(),
-      _scharr_gx(),
-      _scharr_gy(),
-      _old_points(nullptr),
-      _new_points_estimates(nullptr),
-      _new_points(nullptr),
-      _old_points_internal(),
-      _new_points_internal(),
-      _coefficient_table(),
-      _old_values(),
-      _num_levels(0)
-{
-}
-
-CLOpticalFlow::~CLOpticalFlow() = default;
-
-void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
-                              const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
-                              Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
-                              BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), old_pyramid, new_pyramid, old_points, new_points_estimates, new_points, termination, epsilon, num_iterations, window_dimension,
-              use_initial_estimate, border_mode, constant_border_value);
-}
-
-void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
-                              const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
-                              Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
-                              BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid);
-    ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid);
-    ARM_COMPUTE_ERROR_ON(nullptr == old_points);
-    ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates);
-    ARM_COMPUTE_ERROR_ON(nullptr == new_points);
-    ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels());
-    ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels());
-    ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width());
-    ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height());
-    ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values());
-
-    // Set member variables
-    _old_points           = old_points;
-    _new_points_estimates = new_points_estimates;
-    _new_points           = new_points;
-    _num_levels           = old_pyramid->info()->num_levels();
-
-    const float pyr_scale              = old_pyramid->info()->scale();
-    const int   list_length            = old_points->num_values();
-    const int   old_values_list_length = list_length * window_dimension * window_dimension;
-
-    // Create kernels and tensors
-    _tracker_init_kernel.reserve(_num_levels);
-    _tracker_stage0_kernel.reserve(_num_levels);
-    _tracker_stage1_kernel.reserve(_num_levels);
-    _func_scharr.resize(_num_levels);
-    _scharr_gx.resize(_num_levels);
-    _scharr_gy.resize(_num_levels);
-
-    // Create internal keypoint arrays
-    _old_points_internal = std::make_unique<CLLKInternalKeypointArray>(list_length);
-    _old_points_internal->resize(list_length);
-    _new_points_internal = std::make_unique<CLLKInternalKeypointArray>(list_length);
-    _new_points_internal->resize(list_length);
-    _coefficient_table = std::make_unique<CLCoefficientTableArray>(list_length);
-    _coefficient_table->resize(list_length);
-    _old_values = std::make_unique<CLOldValueArray>(old_values_list_length);
-    _old_values->resize(old_values_list_length);
-    _new_points->resize(list_length);
-
-    for(size_t i = 0; i < _num_levels; ++i)
-    {
-        // Get images from the ith level of old and right pyramid
-        ICLImage *old_ith_input = old_pyramid->get_pyramid_level(i);
-        ICLImage *new_ith_input = new_pyramid->get_pyramid_level(i);
-
-        // Get width and height of images
-        const unsigned int width_ith  = old_ith_input->info()->dimension(0);
-        const unsigned int height_ith = new_ith_input->info()->dimension(1);
-
-        // Initialize Scharr tensors
-        TensorInfo tensor_info(TensorShape(width_ith, height_ith), 1, DataType::S16);
-        _scharr_gx[i].allocator()->init(tensor_info);
-        _scharr_gy[i].allocator()->init(tensor_info);
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_scharr_gx[i]);
-        _memory_group.manage(&_scharr_gy[i]);
-
-        // Init Scharr kernel
-        _func_scharr[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
-
-        // Init Lucas-Kanade init kernel
-        _tracker_init_kernel.emplace_back(std::make_unique<CLLKTrackerInitKernel>());
-        _tracker_init_kernel.back()->configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale);
-
-        // Init Lucas-Kanade stage0 kernel
-        _tracker_stage0_kernel.emplace_back(std::make_unique<CLLKTrackerStage0Kernel>());
-        _tracker_stage0_kernel.back()->configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i],
-                                                 _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
-                                                 window_dimension, i);
-
-        // Init Lucas-Kanade stage1 kernel
-        _tracker_stage1_kernel.emplace_back(std::make_unique<CLLKTrackerStage1Kernel>());
-        _tracker_stage1_kernel.back()->configure(compile_context, new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
-                                                 termination, epsilon, num_iterations, window_dimension, i);
-
-        // Allocate intermediate buffers
-        _scharr_gx[i].allocator()->allocate();
-        _scharr_gy[i].allocator()->allocate();
-    }
-
-    // Finalize Lucas-Kanade
-    _tracker_finalize_kernel->configure(compile_context, _new_points_internal.get(), new_points);
-}
-
-void CLOpticalFlow::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    for(unsigned int level = _num_levels; level > 0; --level)
-    {
-        // Run Scharr kernel
-        _func_scharr[level - 1].run();
-
-        // Run Lucas-Kanade init kernel
-        CLScheduler::get().enqueue(*_tracker_init_kernel[level - 1]);
-
-        // Run Lucas-Kanade stage0 kernel
-        CLScheduler::get().enqueue(*_tracker_stage0_kernel[level - 1]);
-
-        // Run Lucas-Kanade stage1 kernel
-        CLScheduler::get().enqueue(*_tracker_stage1_kernel[level - 1]);
-    }
-
-    CLScheduler::get().enqueue(*_tracker_finalize_kernel, true);
-}
diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
deleted file mode 100644
index b2ff5d0..0000000
--- a/src/runtime/CL/functions/CLPhase.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLPhase.h"
-
-#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, phase_type);
-}
-
-void CLPhase::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
-{
-    auto k = std::make_unique<CLMagnitudePhaseKernel>();
-    k->configure(compile_context, input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp
deleted file mode 100644
index 563ec19..0000000
--- a/src/runtime/CL/functions/CLScharr3x3.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLScharr3x3Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLScharr3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLScharr3x3Kernel>();
-    k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp
deleted file mode 100644
index 6724c12..0000000
--- a/src/runtime/CL/functions/CLSobel3x3.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLSobel3x3Kernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-CLSobel3x3::~CLSobel3x3() = default;
-
-void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLSobel3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLSobel3x3Kernel>();
-    k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
deleted file mode 100644
index 98f2157..0000000
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
-
-using namespace arm_compute;
-
-CLSobel5x5::CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _sobel_hor(std::make_unique<CLSobel5x5HorKernel>()),
-      _sobel_vert(std::make_unique<CLSobel5x5VertKernel>()),
-      _border_handler(std::make_unique<CLFillBorderKernel>()),
-      _tmp_x(),
-      _tmp_y()
-{
-}
-
-CLSobel5x5::~CLSobel5x5() = default;
-
-void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    const bool run_sobel_x = output_x != nullptr;
-    const bool run_sobel_y = output_y != nullptr;
-
-    TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16);
-
-    if(run_sobel_x && run_sobel_y)
-    {
-        _tmp_x.allocator()->init(tensor_info);
-        _tmp_y.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_x);
-        _memory_group.manage(&_tmp_y);
-        _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-        _tmp_x.allocator()->allocate();
-        _tmp_y.allocator()->allocate();
-    }
-    else if(run_sobel_x)
-    {
-        _tmp_x.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_x);
-        _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
-        _tmp_x.allocator()->allocate();
-    }
-    else if(run_sobel_y)
-    {
-        _tmp_y.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_y);
-        _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
-        _tmp_y.allocator()->allocate();
-    }
-    _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value));
-}
-
-void CLSobel5x5::run()
-{
-    CLScheduler::get().enqueue(*_border_handler, false);
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    CLScheduler::get().enqueue(*_sobel_hor, false);
-    CLScheduler::get().enqueue(*_sobel_vert);
-}
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
deleted file mode 100644
index a3d63f9..0000000
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
-
-using namespace arm_compute;
-
-CLSobel7x7::CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _sobel_hor(std::make_unique<CLSobel7x7HorKernel>()),
-      _sobel_vert(std::make_unique<CLSobel7x7VertKernel>()),
-      _border_handler(std::make_unique<CLFillBorderKernel>()),
-      _tmp_x(),
-      _tmp_y()
-{
-}
-
-CLSobel7x7::~CLSobel7x7() = default;
-
-void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    const bool run_sobel_x = output_x != nullptr;
-    const bool run_sobel_y = output_y != nullptr;
-
-    TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S32);
-
-    if(run_sobel_x && run_sobel_y)
-    {
-        _tmp_x.allocator()->init(tensor_info);
-        _tmp_y.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_x);
-        _memory_group.manage(&_tmp_y);
-        _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-        _tmp_x.allocator()->allocate();
-        _tmp_y.allocator()->allocate();
-    }
-    else if(run_sobel_x)
-    {
-        _tmp_x.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_x);
-        _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
-        _tmp_x.allocator()->allocate();
-    }
-    else if(run_sobel_y)
-    {
-        _tmp_y.allocator()->init(tensor_info);
-        _memory_group.manage(&_tmp_y);
-        _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
-        _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
-        _tmp_y.allocator()->allocate();
-    }
-    _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value));
-}
-
-void CLSobel7x7::run()
-{
-    CLScheduler::get().enqueue(*_border_handler, false);
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    CLScheduler::get().enqueue(*_sobel_hor, false);
-    CLScheduler::get().enqueue(*_sobel_vert);
-}
diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp
deleted file mode 100644
index a4671f5..0000000
--- a/src/runtime/CL/functions/CLTableLookup.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
-
-#include "src/core/CL/kernels/CLTableLookupKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, lut, output);
-}
-
-void CLTableLookup::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
-{
-    auto k = std::make_unique<CLTableLookupKernel>();
-    k->configure(compile_context, input, lut, output);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp
deleted file mode 100644
index 70bc3b9..0000000
--- a/src/runtime/CL/functions/CLThreshold.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLThreshold.h"
-
-#include "src/core/CL/kernels/CLThresholdKernel.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
-}
-
-void CLThreshold::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
-{
-    auto k = std::make_unique<CLThresholdKernel>();
-    k->configure(compile_context, input, output, info);
-    _kernel = std::move(k);
-}
-} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp
deleted file mode 100644
index 9a22446..0000000
--- a/src/runtime/CL/functions/CLWarpAffine.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLWarpAffineKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value);
-}
-
-void CLWarpAffine::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
-                             uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLWarpAffineKernel>();
-    k->configure(compile_context, input, output, matrix, policy);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
deleted file mode 100644
index 0ec6b42..0000000
--- a/src/runtime/CL/functions/CLWarpPerspective.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value);
-}
-
-void CLWarpPerspective::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
-                                  uint8_t constant_border_value)
-{
-    auto k = std::make_unique<CLWarpPerspectiveKernel>();
-    k->configure(compile_context, input, output, matrix, policy);
-    _kernel = std::move(k);
-    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
deleted file mode 100644
index ad62a22..0000000
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/NEON/kernels/NEConvolutionKernel.h"
-#include "src/core/NEON/kernels/NEConvolutionKernel.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-
-#include <array>
-#include <utility>
-
-namespace arm_compute
-{
-NEConvolution3x3::~NEConvolution3x3() = default;
-
-void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = std::make_unique<NEConvolution3x3Kernel>();
-    k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-
-    auto b = std::make_unique<NEFillBorderKernel>();
-    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-    _border_handler = std::move(b);
-}
-
-template <unsigned int matrix_size>
-NEConvolutionSquare<matrix_size>::~NEConvolutionSquare() = default;
-
-template <unsigned int matrix_size>
-NEConvolutionSquare<matrix_size>::NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
-{
-}
-
-template <unsigned int matrix_size>
-void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
-                                                 uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON(conv == nullptr);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-
-    std::array<int16_t, matrix_size> conv_col{ { 0 } };
-    std::array<int16_t, matrix_size> conv_row{ { 0 } };
-
-    _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size);
-
-    auto b = std::make_unique<NEFillBorderKernel>();
-    if(_is_separable)
-    {
-        DataType intermediate_type = DataType::UNKNOWN;
-        std::tie(std::ignore, intermediate_type) = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size);
-
-        _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type));
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_tmp);
-
-        // Calculate scale
-        if(scale == 0)
-        {
-            scale = calculate_matrix_scale(conv, matrix_size);
-        }
-
-        _kernel_hor  = std::make_unique<NESeparableConvolutionHorKernel<matrix_size>>();
-        _kernel_vert = std::make_unique<NESeparableConvolutionVertKernel<matrix_size>>();
-
-        _kernel_hor->configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
-        _kernel_vert->configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED);
-
-        _tmp.allocator()->allocate();
-
-        b->configure(input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value));
-    }
-    else
-    {
-        _kernel = std::make_unique<NEConvolutionKernel<matrix_size>>();
-        _kernel->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
-        b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-    }
-    _border_handler = std::move(b);
-}
-
-template <unsigned int matrix_size>
-void                   NEConvolutionSquare<matrix_size>::run()
-{
-    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
-
-    if(_is_separable)
-    {
-        MemoryGroupResourceScope scope_mg(_memory_group);
-
-        NEScheduler::get().schedule(_kernel_hor.get(), Window::DimY);
-        NEScheduler::get().schedule(_kernel_vert.get(), Window::DimY);
-    }
-    else
-    {
-        NEScheduler::get().schedule(_kernel.get(), Window::DimY);
-    }
-}
-
-template class arm_compute::NEConvolutionSquare<5>;
-template class arm_compute::NEConvolutionSquare<7>;
-template class arm_compute::NEConvolutionSquare<9>;
-
-NEConvolutionRectangle::~NEConvolutionRectangle() = default;
-
-void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
-{
-    border_mode = (border_mode == BorderMode::UNDEFINED) ? BorderMode::CONSTANT : border_mode;
-    auto k      = std::make_unique<NEConvolutionRectangleKernel>();
-    k->configure(input, output, conv, rows, cols, scale, false);
-    _kernel = std::move(k);
-
-    auto b = std::make_unique<NEFillBorderKernel>();
-    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-    _border_handler = std::move(b);
-}
-} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
deleted file mode 100644
index a34be71..0000000
--- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
-
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode)
-{
-    auto k = std::make_unique<NENonMaximaSuppression3x3Kernel>();
-    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-
-    auto b = std::make_unique<NEFillBorderKernel>();
-    if(border_mode != BorderMode::UNDEFINED)
-    {
-        b->configure(input, BorderSize(1), BorderMode::CONSTANT, static_cast<float>(0.f));
-    }
-    else
-    {
-        b->configure(input, BorderSize(1), BorderMode::UNDEFINED, static_cast<float>(0.f));
-    }
-    _border_handler = std::move(b);
-}
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
new file mode 100644
index 0000000..a55f7bc
--- /dev/null
+++ b/src/runtime/NEON/functions/NERemap.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NERemap.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NERemapKernel.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
+
+    auto k = std::make_unique<NERemapKernel>();
+    k->configure(input, map_x, map_y, output, policy);
+    _kernel = std::move(k);
+
+    auto b = std::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
+}
+} // namespace arm_compute