Rename ported functions

Rename CpuPooling to CpuPool2d
Rename CpuPoolingKernel to CpuPool2dKernel
Rename CpuPoolingAssemblyWrapperKernel to CpuPool2dAssemblyWrapperKernel
Move CpuPool2dAssemblyWrapperKernel in internal subfolder
Rename CpuDepthwiseConvolutionNativeKernel to CpuDepthwiseConv2dNativeKernel
Rename CpuDepthwiseConvolutionAssemblyDispatch to CpuDepthwiseConv2dAssemblyDispatch
Rename CpuDepthwiseConvolution to CpuDepthwiseConv2d
Rename CpuDirectConvolutionKernel to CpuDirectConv2dKernel
Rename CpuDirectConvolutionOutputStageKernel to CpuDirectConv2dOutputStageKernel
Rename CpuDirectConvolution to CpuDirectConv2d
Rename ClPoolingKernel to ClPool2dKernel
Rename ClPooling to ClPool2d
Rename ClDirectConvolutionKernel to ClDirectConv2dKernel

Resolves: COMPMID-4405

Change-Id: I8e48f015e4e492a76a7512f5679cb3eb0cd028f6
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5708
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
index cc96cf1..45481d0 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
@@ -53,7 +53,7 @@
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]  indices         Tensor containing the offset to store the input elements in the output tensor.
-     *                             @ref opencl::ClPooling with indices should precede this function in order to
+     *                             @ref CLPoolingLayer with indices should precede this function in order to
      *                             properly reconstruct the output tensor.
      *                             The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
      * @param[out] output          Destination tensor. Data types supported: Same as @p input.
@@ -65,7 +65,7 @@
      * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
      * @param[in] indices   TensorInfo associated to the tensor containing the offset to store the input elements in the output tensor.
-     *                      @ref opencl::ClPooling with indices should precede this function in order to
+     *                      @ref CLPoolingLayer with indices should precede this function in order to
      *                      properly reconstruct the output tensor.
      *                      The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
index f422728..ecc116e 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
@@ -56,7 +56,7 @@
      *
      * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]  indices   Tensor containing the offset to store the input elements in the output tensor.
-     *                       @ref cpu::kernels::CpuPoolingKernel with indices should precede this function in order to
+     *                       @ref NEPoolingLayer with indices should precede this function in order to
      *                       properly reconstruct the output tensor.
      *                       The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
diff --git a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
similarity index 84%
rename from src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp
rename to src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
index a5d1b61..4ddb35f 100644
--- a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp
+++ b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/ITensorInfo.h"
@@ -74,7 +74,7 @@
     const size_t   input_width;
     const size_t   input_depth;
 
-    DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1)
+    DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
         : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
           x_start(w.x().start()),
           x_end(w.x().end()),
@@ -110,14 +110,14 @@
 }
 
 template <typename T>
-void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                                    const Size2D &dilation, const Window &window, bool has_biases)
 {
     constexpr auto element_per_vector = vector_size / sizeof(T);
     using VectorType                  = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
     using TagType                     = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
 
-    const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
+    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
 
     const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
 
@@ -135,9 +135,9 @@
     Window win_output = window;
     win_output.set(Window::DimX, dim_manual_loop);
 
-    Iterator input_it(input, win_input);
+    Iterator input_it(src, win_input);
     Iterator weights_it(weights, win_weights);
-    Iterator output_it(output, win_output);
+    Iterator output_it(dst, win_output);
     Iterator biases_it{};
 
     if(has_biases)
@@ -224,10 +224,10 @@
 }
 
 template <typename T>
-void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                                const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
 {
-    const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
+    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
 
     Window execution_window = window;
     execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
@@ -246,9 +246,9 @@
     Window win_output = window;
     win_output.set_dimension_step(Window::DimX, run_info.x_step);
 
-    Iterator input_it(input, win_input);
+    Iterator input_it(src, win_input);
     Iterator weights_it(weights, win_weights);
-    Iterator output_it(output, win_output);
+    Iterator output_it(dst, win_output);
     Iterator biases_it{};
 
     if(has_biases)
@@ -306,23 +306,24 @@
 }
 
 template <typename T, typename TW>
-void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                          const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
+void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
+                                          const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
 {
+    ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
     constexpr auto element_per_vector = vector_size / sizeof(T);
     using VectorType                  = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
     using TagType                     = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
     using AccType                     = int32_t;
     using AccArrayType                = std::array<AccType, element_per_vector>;
 
-    const auto out_of_bound_value  = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
+    const auto out_of_bound_value  = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
     const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
 
-    const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
+    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
 
-    const int32_t input_qoffset   = input->info()->quantization_info().uniform().offset;
+    const int32_t input_qoffset   = src->info()->quantization_info().uniform().offset;
     const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
-    const int32_t output_qoffset  = output->info()->quantization_info().uniform().offset;
+    const int32_t output_qoffset  = dst->info()->quantization_info().uniform().offset;
     const int32_t k_offset        = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
 
     Window execution_window = window;
@@ -339,9 +340,9 @@
     Window win_output = window;
     win_output.set(Window::DimX, dim_manual_loop);
 
-    Iterator input_it(input, win_input);
+    Iterator input_it(src, win_input);
     Iterator weights_it(weights, win_weights);
-    Iterator output_it(output, win_output);
+    Iterator output_it(dst, win_output);
     Iterator biases_it{};
 
     if(has_biases)
@@ -482,18 +483,18 @@
 }
 
 template <typename T, typename TW>
-void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                      const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
+void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
+                                      const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
 {
     using AccType = int32_t;
 
-    const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
+    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
 
-    const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
+    const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
 
-    const int32_t input_qoffset   = input->info()->quantization_info().uniform().offset;
+    const int32_t input_qoffset   = src->info()->quantization_info().uniform().offset;
     const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
-    const int32_t output_qoffset  = output->info()->quantization_info().uniform().offset;
+    const int32_t output_qoffset  = dst->info()->quantization_info().uniform().offset;
     const int32_t k_offset        = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
 
     Window execution_window = window;
@@ -512,9 +513,9 @@
     Window win_output = window;
     win_output.set_dimension_step(Window::DimX, run_info.x_step);
 
-    Iterator input_it(input, win_input);
+    Iterator input_it(src, win_input);
     Iterator weights_it(weights, win_weights);
-    Iterator output_it(output, win_output);
+    Iterator output_it(dst, win_output);
     Iterator biases_it{};
 
     if(has_biases)
@@ -585,8 +586,8 @@
 }
 
 template <typename T, typename TW>
-void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                              const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
+void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
+                                              const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
 {
     constexpr int half_vec = vector_size / 2;
 
@@ -595,11 +596,11 @@
     using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
     using TagType          = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
 
-    const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
+    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
 
-    const auto input_qoffset_vec   = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(input->info()->quantization_info().uniform().offset), TagType{})));
+    const auto input_qoffset_vec   = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
     const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
-    const auto output_qoffset_vec  = wrapper::vdup_n(output->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
+    const auto output_qoffset_vec  = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
 
     const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
     const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
@@ -624,9 +625,9 @@
     Window win_output = window;
     win_output.set_dimension_step(Window::DimX, run_info.x_step);
 
-    Iterator input_it(input, win_input);
+    Iterator input_it(src, win_input);
     Iterator weights_it(weights, win_weights);
-    Iterator output_it(output, win_output);
+    Iterator output_it(dst, win_output);
     Iterator biases_it{};
 
     if(has_biases)
@@ -722,16 +723,16 @@
     input_it, weights_it, biases_it, output_it);
 }
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > input->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > input->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
-    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * info.depth_multiplier) != weights->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
+    ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0));
     ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
     ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1));
 
@@ -742,7 +743,7 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
     }
 
     if(biases != nullptr)
@@ -750,7 +751,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
 
-        if(is_data_type_quantized_asymmetric(input->data_type()))
+        if(is_data_type_quantized_asymmetric(src->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
@@ -760,36 +761,36 @@
         }
     }
 
-    if(output->total_size() != 0)
+    if(dst->total_size() != 0)
     {
-        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
     }
 
     return Status{};
 }
 } // namespace
 
-CpuDepthwiseConvolutionNativeKernel::CpuDepthwiseConvolutionNativeKernel()
+CpuDepthwiseConv2dNativeKernel::CpuDepthwiseConv2dNativeKernel()
     : _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
 {
 }
 
-void CpuDepthwiseConvolutionNativeKernel::configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info)
+void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, weights, (biases != nullptr) ? biases : nullptr, output, info));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? biases : nullptr, dst, info));
 
     _conv_info        = info.pad_stride_info;
     _depth_multiplier = info.depth_multiplier;
     _dilation         = info.dilation;
     _has_biases       = (biases != nullptr);
 
-    if(is_data_type_quantized(input->data_type()))
+    if(is_data_type_quantized(src->data_type()))
     {
-        const auto input_scale  = input->quantization_info().uniform().scale;
-        const auto output_scale = output->quantization_info().uniform().scale;
+        const auto input_scale  = src->quantization_info().uniform().scale;
+        const auto output_scale = dst->quantization_info().uniform().scale;
 
         auto weights_scale = weights->quantization_info().scale();
         if(!is_data_type_quantized_per_channel(weights->data_type()))
@@ -815,50 +816,50 @@
     switch(weights->data_type())
     {
         case DataType::QASYMM8:
-            _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, uint8_t>;
+            _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, uint8_t>;
             break;
         case DataType::QASYMM8_SIGNED:
-            _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
+            _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
             break;
         case DataType::QSYMM8_PER_CHANNEL:
-            if(input->data_type() == DataType::QASYMM8)
+            if(src->data_type() == DataType::QASYMM8)
             {
-                _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, int8_t>;
+                _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, int8_t>;
             }
             else
             {
-                _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
+                _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
             }
             break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float16_t, float16_t>;
+            _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float16_t, float16_t>;
             break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
-            _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float, float>;
+            _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float, float>;
             break;
         default:
             ARM_COMPUTE_ERROR("Data type not supported");
             break;
     }
 
-    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
-    auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
+    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+    auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info()));
 
-    Window win = calculate_max_window(*output, Steps());
+    Window win = calculate_max_window(*dst, Steps());
     ICpuKernel::configure(win);
 }
 
-Status CpuDepthwiseConvolutionNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info));
     return Status{};
 }
 
-template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::FloatEnalber<T>>
-void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
-                                                        ITensor *dst, const Window &window, bool has_biases)
+template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::FloatEnalber<T>>
+void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
+                                                   ITensor *dst, const Window &window, bool has_biases)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
@@ -873,9 +874,9 @@
     }
 }
 
-template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::Quantized8bitEnalber<T>>
-void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
-                                                        ITensor *dst, const Window &window, bool has_biases)
+template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::Quantized8bitEnalber<T>>
+void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
+                                                   ITensor *dst, const Window &window, bool has_biases)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
@@ -900,7 +901,7 @@
     }
 }
 
-void CpuDepthwiseConvolutionNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
similarity index 62%
rename from src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h
rename to src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
index 242536d..559c46d 100644
--- a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h
+++ b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H
-#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H
+#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H
+#define ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H
 
 #include "arm_compute/core/utils/misc/Traits.h"
 #include "src/core/common/Macros.h"
@@ -40,46 +40,38 @@
 namespace kernels
 {
 /** Interface for the kernel to run a depthwise convolution native on a tensor. */
-class CpuDepthwiseConvolutionNativeKernel : public ICpuKernel
+class CpuDepthwiseConv2dNativeKernel : public ICpuKernel
 {
 public:
     const char *name() const override
     {
-        return "CpuDepthwiseConvolutionNativeKernel";
+        return "CpuDepthwiseConv2dNativeKernel";
     }
     /** Default constructor */
-    CpuDepthwiseConvolutionNativeKernel();
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConvolutionNativeKernel);
+    CpuDepthwiseConv2dNativeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dNativeKernel);
 
     /** Initialize the function's source, destination and parameters.
      *
      * @note Supported data layouts: NHWC
      *
-     * @param[in]  input   Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  src     Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]  weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
-     *                     Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+     *                     Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
      * @param[in]  biases  Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                     Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output  Destination tensor. Data type supported: Same as @p input.
+     *                     Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] dst     Destination tensor. Data type supported: Same as @p src.
      * @param[in]  info    Depthwise convolution meta-data.
      *
      */
-    void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionNativeKernel
+    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+    /** Static function to check if given info will lead to a valid configuration
      *
-     * @note Supported data layouts: NHWC
-     *
-     * @param[in] input   Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H].
-     *                    Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] biases  Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                    Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] output  Destination tensor info. Data type supported: Same as @p input.
-     * @param[in] info    Depthwise convolution meta-data.
+     * Similar to CpuDepthwiseConv2dNativeKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -101,7 +93,7 @@
      *
      * @param[in] window Region on which to execute the kernel.
      */
-    using DepthwiseFunctionPtr = void (CpuDepthwiseConvolutionNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
+    using DepthwiseFunctionPtr = void (CpuDepthwiseConv2dNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
 
     DepthwiseFunctionPtr _func;
     PadStrideInfo        _conv_info;
@@ -114,4 +106,4 @@
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H */
+#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp
similarity index 98%
rename from src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp
rename to src/core/cpu/kernels/CpuDirectConv2dKernel.cpp
index 4f46eb2..c0fc415 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp
+++ b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h"
 
 #include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
 #include "src/core/NEON/wrapper/wrapper.h"
@@ -995,7 +995,7 @@
 } // namespace
 
 template <typename T>
-void CpuDirectConvolutionKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
+void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
 {
     // This function assumes that input and weights have not padding in channel
 
@@ -1116,7 +1116,7 @@
 }
 
 template <typename T>
-void CpuDirectConvolutionKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
+void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
 {
     // Declare useful types
     using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
@@ -1219,12 +1219,12 @@
     out);
 }
 
-BorderSize CpuDirectConvolutionKernel::border_size() const
+BorderSize CpuDirectConv2dKernel::border_size() const
 {
     return _border_size;
 }
 
-void CpuDirectConvolutionKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
+void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
 
@@ -1263,7 +1263,7 @@
     ICpuKernel::configure(win_config.second);
 }
 
-Status CpuDirectConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
 {
     unsigned int num_weight_elems_read_per_row   = 0;
     unsigned int num_elems_read_per_iteration    = 0;
@@ -1283,7 +1283,7 @@
     return Status{};
 }
 
-void CpuDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -1376,7 +1376,7 @@
         }
     }
 }
-const char *CpuDirectConvolutionKernel::name() const
+const char *CpuDirectConv2dKernel::name() const
 {
     return "CpuDirectConvolutionLayerKernel";
 }
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionKernel.h b/src/core/cpu/kernels/CpuDirectConv2dKernel.h
similarity index 75%
rename from src/core/cpu/kernels/CpuDirectConvolutionKernel.h
rename to src/core/cpu/kernels/CpuDirectConv2dKernel.h
index fb82183..62ed96f 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionKernel.h
+++ b/src/core/cpu/kernels/CpuDirectConv2dKernel.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H
-#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H
+#define ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H
 
 #include "src/core/common/Macros.h"
 #include "src/core/cpu/ICpuKernel.h"
@@ -35,13 +35,13 @@
 namespace kernels
 {
 /** Interface for the kernel to perform Direct Convolution Layer. */
-class CpuDirectConvolutionKernel : public ICpuKernel
+class CpuDirectConv2dKernel : public ICpuKernel
 {
 public:
     /** Default constructor */
-    CpuDirectConvolutionKernel() = default;
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionKernel);
-    /** Set the input, weights, and output tensors.
+    CpuDirectConv2dKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel);
+    /** Set the src, weights, and dst tensors.
      *
      * @note: DirectConvolution only works in the following configurations:
      *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
@@ -57,16 +57,9 @@
      * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
      */
     void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionKernel
+    /** Static function to check if given info will lead to a valid configuration
      *
-     * @param[in] src       The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                      while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
-     * @param[in] weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                      The 3rd dimension must be the same as the input's volume 3rd dimension.
-     *                      Data type supported:Same as @p input.
-     * @param[in] dst       Output tensor.
-     *                      The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32
-     * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     * Similar to CpuDirectConv2dKernel::configure()
      *
      * @return a status
      */
@@ -97,4 +90,4 @@
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H */
+#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
similarity index 95%
rename from src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp
rename to src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
index 5f7a574..662d052 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp
+++ b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -384,8 +384,8 @@
 }
 } // namespace
 
-void CpuDirectConvolutionOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
-                                                      const DirectConvolutionLayerOutputStageKernelInfo &info)
+void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+                                                 const DirectConvolutionLayerOutputStageKernelInfo &info)
 {
     ARM_COMPUTE_UNUSED(bias);
     // Perform validation step
@@ -483,14 +483,14 @@
     }
 }
 
-Status CpuDirectConvolutionOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
-                                                       const DirectConvolutionLayerOutputStageKernelInfo &info)
+Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+                                                  const DirectConvolutionLayerOutputStageKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
     return Status{};
 }
 
-void CpuDirectConvolutionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -504,9 +504,9 @@
     (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
 }
 
-const char *CpuDirectConvolutionOutputStageKernel::name() const
+const char *CpuDirectConv2dOutputStageKernel::name() const
 {
-    return "CpuDirectConvolutionOutputStageKernel";
+    return "CpuDirectConv2dOutputStageKernel";
 }
 } // namespace kernels
 } // namespace cpu
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
similarity index 71%
rename from src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h
rename to src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
index 9eeab19..62bc5d4 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h
+++ b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H
-#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H
+#define ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
 #include "src/core/common/Macros.h"
@@ -41,33 +41,27 @@
  * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part
  *       of the @ref DirectConvolutionLayerOutputStageKernelInfo.
  */
-class CpuDirectConvolutionOutputStageKernel : public ICpuKernel
+class CpuDirectConv2dOutputStageKernel : public ICpuKernel
 {
 public:
     /** Default constructor */
-    CpuDirectConvolutionOutputStageKernel() = default;
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionOutputStageKernel);
+    CpuDirectConv2dOutputStageKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dOutputStageKernel);
     /** Set the accumulate buffer and the biases of the kernel.
      *
-     * @param[in, out] src  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
+     * @param[in, out] src  Input to add the bias to. If @p dst is not specified then accumulation is done in-place.
      *                      Data type supported: F16/F32/S32
      * @param[in]      bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src
-     * @param[out]     dst  (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
+     * @param[out]     dst  (Optional) If the dst tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
      *                      Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
      *                      Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
      * @param[in]      info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
      */
     void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr,
                    const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionOutputStageKernel
+    /** Static function to check if given info will lead to a valid configuration
      *
-     * @param[in] src  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
-     *                 Data type supported: F16/F32/S32
-     * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src
-     * @param[in] dst  (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
-     *                 Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
-     *                 Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
-     * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
+     * Similar to CpuDirectConv2dOutputStageKernel::configure()
      *
      * @return a status
      */
@@ -90,4 +84,4 @@
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H */
+#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuPoolingKernel.cpp b/src/core/cpu/kernels/CpuPool2dKernel.cpp
similarity index 97%
rename from src/core/cpu/kernels/CpuPoolingKernel.cpp
rename to src/core/cpu/kernels/CpuPool2dKernel.cpp
index a55f60d..e6f5890 100644
--- a/src/core/cpu/kernels/CpuPoolingKernel.cpp
+++ b/src/core/cpu/kernels/CpuPool2dKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/cpu/kernels/CpuPoolingKernel.h"
+#include "src/core/cpu/kernels/CpuPool2dKernel.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -374,12 +374,12 @@
 }
 } // namespace
 
-BorderSize CpuPoolingKernel::border_size() const
+BorderSize CpuPool2dKernel::border_size() const
 {
     return _border_size;
 }
 
-void CpuPoolingKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info;
@@ -420,7 +420,7 @@
     }
 }
 
-Status CpuPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
 
@@ -446,7 +446,7 @@
     return Status{};
 }
 
-void CpuPoolingKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -505,9 +505,9 @@
     uk->ukernel(src, dst, indices, _pool_info, window_src, window);
 }
 
-const char *CpuPoolingKernel::name() const
+const char *CpuPool2dKernel::name() const
 {
-    return "CpuPoolingKernel";
+    return "CpuPool2dKernel";
 }
 } // namespace kernels
 } // namespace cpu
diff --git a/src/core/cpu/kernels/CpuPoolingKernel.h b/src/core/cpu/kernels/CpuPool2dKernel.h
similarity index 78%
rename from src/core/cpu/kernels/CpuPoolingKernel.h
rename to src/core/cpu/kernels/CpuPool2dKernel.h
index 87d8f67..9529800 100644
--- a/src/core/cpu/kernels/CpuPoolingKernel.h
+++ b/src/core/cpu/kernels/CpuPool2dKernel.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_POOLING_KERNEL_H
-#define ARM_COMPUTE_CPU_POOLING_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_POOL2D_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H
 
 #include "arm_compute/core/Types.h"
 #include "src/core/common/Macros.h"
@@ -35,12 +35,12 @@
 namespace kernels
 {
 /** Interface for the pooling layer kernel */
-class CpuPoolingKernel : public ICpuKernel
+class CpuPool2dKernel : public ICpuKernel
 {
 public:
     /** Default constructor */
-    CpuPoolingKernel() = default;
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPoolingKernel);
+    CpuPool2dKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel);
     /** Configure kernel for a given list of arguments
      *
      * @note F16 are supported for pool sizes 2 and 3 only
@@ -51,14 +51,9 @@
      * @param[out] indices   (optional) The indices of the maximal values. Data type supported: U32.
      */
     void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuPoolingKernel
+    /** Static function to check if given info will lead to a valid configuration
      *
-     * @note F16 are supported for pool sizes 2 and 3 only
-     *
-     * @param[in] src       Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] dst       Destination tensor info. Data types supported: Same as @p src.
-     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
+     * Similar to CpuPool2dKernel::configure()
      *
      * @return a status
      */
@@ -80,4 +75,4 @@
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_POOLING_KERNEL_H */
+#endif /*ARM_COMPUTE_CPU_POOL2D_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
similarity index 92%
rename from src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp
rename to src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
index ccf7388..c78ffb9 100644
--- a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp
+++ b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h"
+#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -41,7 +41,7 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-void CpuPoolingAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
@@ -88,7 +88,7 @@
     INEKernel::configure(win);
 }
 
-Status CpuPoolingAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
+Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
 
@@ -136,7 +136,7 @@
     return Status{};
 }
 
-void CpuPoolingAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -170,18 +170,18 @@
                          working_space, info.thread_id, info.num_threads);
 }
 
-size_t CpuPoolingAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
+size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
 {
     return _kernel_asm->get_working_size(num_threads);
 }
 
-bool CpuPoolingAssemblyWrapperKernel::is_configured() const
+bool CpuPool2dAssemblyWrapperKernel::is_configured() const
 {
     return _kernel_asm != nullptr;
 }
 
 template <typename Typesrc, typename Typedst>
-void CpuPoolingAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
 {
     const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
 
@@ -220,7 +220,7 @@
 }
 
 template <typename Typesrc, typename Typedst>
-void CpuPoolingAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
 {
     const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
 
diff --git a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
similarity index 78%
rename from src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h
rename to src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
index 34ec452..3afa4c1 100644
--- a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h
+++ b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
 
 #include "arm_compute/core/Types.h"
 #include "src/core/NEON/kernels/assembly/pooling.hpp"
@@ -41,23 +41,21 @@
   *
   * Some kernels were written in assembly and highly optimised for specific
   * CPUs like A53 or A55. The arm compute library creates an instance of
-  * CpuPoolingAssemblyWrapperKernel and other auxiliary data structures to
+  * CpuPool2dAssemblyWrapperKernel and other auxiliary data structures to
   * execute a single assembly kernel in the context of an NEFunction.
   *
   */
-class CpuPoolingAssemblyWrapperKernel final : public ICpuKernel
+class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel
 {
 public:
     /** Constructor
      */
-    CpuPoolingAssemblyWrapperKernel()                                   = default;
-    CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &)  = delete;
-    CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &&) = default;
-    CpuPoolingAssemblyWrapperKernel &operator=(CpuPoolingAssemblyWrapperKernel &) = delete;
+    CpuPool2dAssemblyWrapperKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dAssemblyWrapperKernel);
 
     const char *name() const override
     {
-        return "CpuPoolingAssemblyWrapperKernel";
+        return "CpuPool2dAssemblyWrapperKernel";
     }
 
     /** Initialise the kernel's src and dst.
@@ -69,13 +67,11 @@
      */
     void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
 
-    /** Indicates whether or not this function can be used to process the given parameters.
+    /** Static function to check if given info will lead to a valid configuration
      *
-     * @param[in] src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] dst  Destination tensor to store the result of pooling. Data types supported: same as @p src.
-     * @param[in] info Pooling meta-data
+     * Similar to CpuPool2dAssemblyWrapperKernel::configure()
      *
-     * @return a status.
+     * @return a status
      */
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info);
 
@@ -120,4 +116,4 @@
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H */
+#endif /* ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
similarity index 97%
rename from src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp
rename to src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index 0a5101f..2c9a4f3 100644
--- a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h"
+#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -369,13 +369,13 @@
 
 } // namespace
 
-BorderSize ClDirectConvolutionKernel::border_size() const
+BorderSize ClDirectConv2dKernel::border_size() const
 {
     return _border_size;
 }
 
-void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                                          const PadStrideInfo &conv_info)
+void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+                                     const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
 
@@ -564,8 +564,8 @@
     _config_id += lower_string(string_from_data_layout(_data_layout));
 }
 
-Status ClDirectConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                                           const GPUTarget target)
+Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+                                      const GPUTarget target)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first);
@@ -573,7 +573,7 @@
     return Status{};
 }
 
-void ClDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
diff --git a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
similarity index 72%
rename from src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h
rename to src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
index 384b561..ec76624 100644
--- a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h
+++ b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H
-#define ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
 
 #include "src/core/common/Macros.h"
 #include "src/core/gpu/cl/ClCompileContext.h"
@@ -36,11 +36,11 @@
 {
 /** Interface for the  direct convolution kernel.
  */
-class ClDirectConvolutionKernel : public IClKernel
+class ClDirectConv2dKernel : public IClKernel
 {
 public:
-    ClDirectConvolutionKernel() = default;
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConvolutionKernel);
+    ClDirectConv2dKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConv2dKernel);
     /** Set the src, weights, biases and dst tensors info.
      *
      * @note: Due to set_valid_region(), thus src/weights/biases cannot be const. Need to change this once the set_valid_region() is removed.
@@ -64,19 +64,9 @@
      * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
      */
     void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref ClDirectConvolutionKernel
+    /** Static function to check if given info will lead to a valid configuration
      *
-     * @param[in] src       The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM],
-     *                      while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in] weights   Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                      The 3rd dimension must be the same as the src's volume 3rd dimension.
-     *                      Data type supported:Same as @p src.
-     * @param[in] biases    Biases tensor info. Biases are 1D tensor with dimension [OFM].
-     *                      Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[in] dst       Output tensor info.
-     *                      The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p src.
-     * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in] target    Target GPU architecture.
+     * Similar to ClDirectConv2dKernel::configure()
      *
      * @return a status
      */
@@ -94,4 +84,4 @@
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H */
+#endif /*ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClPoolingKernel.cpp b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp
similarity index 97%
rename from src/core/gpu/cl/kernels/ClPoolingKernel.cpp
rename to src/core/gpu/cl/kernels/ClPool2dKernel.cpp
index 08a3ce3..0e15bff 100644
--- a/src/core/gpu/cl/kernels/ClPoolingKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/gpu/cl/kernels/ClPoolingKernel.h"
+#include "src/core/gpu/cl/kernels/ClPool2dKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -202,17 +202,17 @@
 }
 } // namespace
 
-ClPoolingKernel::ClPoolingKernel()
+ClPool2dKernel::ClPool2dKernel()
     : _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
 {
 }
 
-BorderSize ClPoolingKernel::border_size() const
+BorderSize ClPool2dKernel::border_size() const
 {
     return _border_size;
 }
 
-void ClPoolingKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
@@ -422,7 +422,7 @@
     ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
 }
 
-Status ClPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices));
     ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(src->clone().get(), dst->clone().get(), pool_info)));
@@ -430,7 +430,7 @@
     return Status{};
 }
 
-void ClPoolingKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
diff --git a/src/core/gpu/cl/kernels/ClPoolingKernel.h b/src/core/gpu/cl/kernels/ClPool2dKernel.h
similarity index 80%
rename from src/core/gpu/cl/kernels/ClPoolingKernel.h
rename to src/core/gpu/cl/kernels/ClPool2dKernel.h
index c1ce859..8ecb8eb 100644
--- a/src/core/gpu/cl/kernels/ClPoolingKernel.h
+++ b/src/core/gpu/cl/kernels/ClPool2dKernel.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CL_POOLING_KERNEL_H
-#define ARM_COMPUTE_CL_POOLING_KERNEL_H
+#ifndef ARM_COMPUTE_CL_POOL2D_KERNEL_H
+#define ARM_COMPUTE_CL_POOL2D_KERNEL_H
 
 #include "src/core/common/Macros.h"
 #include "src/core/gpu/cl/ClCompileContext.h"
@@ -35,12 +35,12 @@
 namespace kernels
 {
 /** Interface for the pooling layer kernel */
-class ClPoolingKernel : public IClKernel
+class ClPool2dKernel : public IClKernel
 {
 public:
     /** Default constructor */
-    ClPoolingKernel();
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPoolingKernel);
+    ClPool2dKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool2dKernel);
 
     /** Configure kernel for a given list of arguments
      *
@@ -52,12 +52,9 @@
      * @param[out] indices         (optional) The indices of the maximal values. Data type supported: U32.
      */
     void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref ClPoolingKernel
+    /** Static function to check if given info will lead to a valid configuration
      *
-     * @param[in] src       Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] dst       Destination tensor info. Data types supported: same as @p src.
-     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
+     * Similar to ClPool2dKernel::configure()
      *
      * @return a status
      */
@@ -76,4 +73,4 @@
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_POOLING_KERNEL_H */
+#endif /* ARM_COMPUTE_CL_POOL2D_KERNEL_H */