Make NEON Pooling kernels and functions state-less

Partially resolves COMPMID-3999

Change-Id: Ib39d40694df5c5f0a9401488e0c3af3ac26e8c55
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4984
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index bc5ae34..5653fc8 100644
--- a/Android.bp
+++ b/Android.bp
@@ -289,7 +289,6 @@
         "src/core/NEON/kernels/NENormalizationLayerKernel.cpp",
         "src/core/NEON/kernels/NEPadLayerKernel.cpp",
         "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp",
-        "src/core/NEON/kernels/NEPoolingLayerKernel.cpp",
         "src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp",
         "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp",
         "src/core/NEON/kernels/NEQuantizationLayerKernel.cpp",
@@ -340,7 +339,6 @@
         "src/core/NEON/kernels/arm_gemm/quantized.cpp",
         "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
         "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
-        "src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.cpp",
         "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp",
         "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp",
         "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp",
@@ -409,6 +407,8 @@
         "src/core/cpu/kernels/CpuFillKernel.cpp",
         "src/core/cpu/kernels/CpuFloorKernel.cpp",
         "src/core/cpu/kernels/CpuPermuteKernel.cpp",
+        "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp",
+        "src/core/cpu/kernels/CpuPoolingKernel.cpp",
         "src/core/cpu/kernels/CpuReshapeKernel.cpp",
         "src/core/cpu/kernels/CpuSubKernel.cpp",
         "src/core/cpu/kernels/activation/NEON/fp16.cpp",
@@ -736,7 +736,6 @@
         "src/runtime/NEON/functions/NEPermute.cpp",
         "src/runtime/NEON/functions/NEPhase.cpp",
         "src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp",
-        "src/runtime/NEON/functions/NEPoolingAssemblyDispatch.cpp",
         "src/runtime/NEON/functions/NEPoolingLayer.cpp",
         "src/runtime/NEON/functions/NEPriorBoxLayer.cpp",
         "src/runtime/NEON/functions/NEQLSTMLayer.cpp",
@@ -796,6 +795,8 @@
         "src/runtime/cpu/operators/CpuFill.cpp",
         "src/runtime/cpu/operators/CpuFloor.cpp",
         "src/runtime/cpu/operators/CpuPermute.cpp",
+        "src/runtime/cpu/operators/CpuPooling.cpp",
+        "src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp",
         "src/runtime/cpu/operators/CpuReshape.cpp",
         "src/runtime/cpu/operators/CpuSub.cpp",
         "src/runtime/gpu/cl/operators/ClActivation.cpp",
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index d239138..91b3a70 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -32,17 +32,15 @@
 
 namespace arm_compute
 {
+// Forward declarations
 class ITensor;
 class ITensorInfo;
-class NEPoolingLayerKernel;
-class NEFillBorderKernel;
-class NEPoolingAssemblyDispatch;
 
 /** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following NEON kernels:
  *
  * -# @ref NEFillBorderKernel (executed if padding size is different from zero)
- * -# @ref NEPoolingLayerKernel
- * -# @ref NEPoolingAssemblyDispatch
+ * -# @ref cpu::kernels::CpuPoolingKernel
+ * -# @ref cpu::CpuPoolingAssemblyDispatch
  */
 class NEPoolingLayer : public IFunction
 {
@@ -86,14 +84,8 @@
     void run() override;
 
 private:
-    std::shared_ptr<IMemoryManager> _memory_manager;
-
-    std::unique_ptr<NEPoolingLayerKernel>      _pooling_layer_kernel;
-    std::unique_ptr<NEFillBorderKernel>        _border_handler;
-    std::unique_ptr<NEPoolingAssemblyDispatch> _asm_glue;
-
-    bool       _is_global_pooling_layer;
-    DataLayout _data_layout;
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 }
 #endif /* ARM_COMPUTE_NEPOOLINGLAYER_H */
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 735f60a..ab2495d 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -166,7 +166,7 @@
    - NELocallyConnectedMatrixMultiplyKernel
    - @ref NEGEMMLowpOffsetContributionKernel
    - @ref NEGEMMTranspose1xWKernel
-   - @ref NEPoolingLayerKernel
+   - NEPoolingLayerKernel
    - @ref NEConvolutionKernel
    - @ref NEDepthwiseConvolutionLayerNativeKernel
    - @ref NEGEMMLowpMatrixMultiplyKernel
@@ -1120,7 +1120,7 @@
  - Added QASYMM8 support to the following NEON kernels:
     - NEDepthwiseConvolutionLayer3x3Kernel
     - @ref NEFillBorderKernel
-    - @ref NEPoolingLayerKernel
+    - NEPoolingLayerKernel
  - Added new examples:
     - graph_cl_mobilenet_qasymm8.cpp
     - graph_inception_v3.cpp
@@ -1299,7 +1299,7 @@
  - New NEON kernels / functions:
    - NEActivationLayerKernel / @ref NEActivationLayer
    - GEMM refactoring + FP16 support (Requires armv8.2 CPU): @ref NEGEMMInterleave4x4Kernel, @ref NEGEMMTranspose1xWKernel, @ref NEGEMMMatrixMultiplyKernel, @ref NEGEMMMatrixAdditionKernel / @ref NEGEMM
-   - @ref NEPoolingLayerKernel / @ref NEPoolingLayer
+   - NEPoolingLayerKernel / @ref NEPoolingLayer
 
 v17.02.1 Sources preview
  - New OpenCL kernels / functions:
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 87eec38..c636e5b 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -101,7 +101,6 @@
 #include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
 #include "src/core/NEON/kernels/NEPadLayerKernel.h"
 #include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
-#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
 #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
 #include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "src/core/NEON/kernels/NEQuantizationLayerKernel.h"
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 4880790..10384d4 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,12 +33,8 @@
 #include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "src/core/helpers/WindowHelpers.h"
 
-#include <algorithm>
-#include <cstdint>
-
 namespace arm_compute
 {
-class Coordinates;
 namespace
 {
 inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
@@ -100,20 +96,26 @@
 void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
-    //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
-    ARM_COMPUTE_ERROR_ON(tensor->info()->data_type() == DataType::UNKNOWN);
+    _tensor = tensor;
+    configure(tensor->info(), border_size, border_mode, constant_border_value);
+}
 
-    _tensor                = tensor;
+void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+    //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
+    ARM_COMPUTE_ERROR_ON(tensor->data_type() == DataType::UNKNOWN);
+
     _border_size           = border_size;
     _mode                  = border_mode;
     _constant_border_value = constant_border_value;
 
-    _border_size.limit(tensor->info()->padding());
+    _border_size.limit(tensor->padding());
 
     Window win;
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
     win.set(Window::DimY, Window::Dimension(0, 1, 1));
-    win.use_tensor_dimensions(_tensor->info()->tensor_shape(), Window::DimZ);
+    win.use_tensor_dimensions(tensor->tensor_shape(), Window::DimZ);
     INEKernel::configure(win);
 }
 
@@ -156,6 +158,12 @@
     }
 }
 
+void NEFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+    _tensor = tensors.get_tensor(TensorType::ACL_SRC_DST);
+    run(window, info);
+}
+
 void NEFillBorderKernel::fill_replicate_single_channel(const Window &window)
 {
     uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h
index 65908be..2c85158 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.h
+++ b/src/core/NEON/kernels/NEFillBorderKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -65,9 +65,21 @@
      *
      */
     void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    /** Initialise the function.
+     *
+     * @note This kernel fills the borders within the XY-planes.
+     *
+     * @param[in,out] tensor                Tensor info to process. Data types supported: All.
+     * @param[in]     border_size           Size of the border to fill in elements.
+     * @param[in]     border_mode           Border mode to use for the convolution.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 
 private:
     void fill_replicate_single_channel(const Window &window);
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
index 8cdfe2b..f422728 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,7 +56,7 @@
      *
      * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]  indices   Tensor containing the offset to store the input elements in the output tensor.
-     *                       @ref NEPoolingLayerKernel with indices should precede this function in order to
+     *                       @ref cpu::kernels::CpuPoolingKernel with indices should precede this function in order to
      *                       properly reconstruct the output tensor.
      *                       The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.h b/src/core/NEON/kernels/NEPoolingLayerKernel.h
deleted file mode 100644
index aa3d2f3..0000000
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.h
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the pooling layer kernel */
-class NEPoolingLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEPoolingLayerKernel";
-    }
-    /** Default constructor */
-    NEPoolingLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPoolingLayerKernel(const NEPoolingLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPoolingLayerKernel &operator=(const NEPoolingLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEPoolingLayerKernel(NEPoolingLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEPoolingLayerKernel &operator=(NEPoolingLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEPoolingLayerKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @note F16 are supported for pool sizes 2 and 3 only
-     *
-     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[out] indices   (optional) The indices of the maximal values. Data type supported: U32.
-     */
-    void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel
-     *
-     * @note F16 are supported for pool sizes 2 and 3 only
-     *
-     * @param[in] input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] output    Destination tensor. Data types supported: Same as @p input.
-     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Function to perform 2x2 pooling.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
-    void pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window);
-    /** Function to perform MxN pooling for 32-bit floating point values.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform MxN pooling for 32-bit floating point values (NHWC).
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform 7x7 pooling.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform 3x3 pooling.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform 2x2 pooling for float16_t.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform 2x2 pooling and compute the pooling indices for FP32/FP16. The indices can be used for max unpool.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
-    template <typename T>
-    void pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window);
-    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
-    void pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window);
-    /** Function to perform 3x3 pooling.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform MxN pooling for 16-bit floating point values.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform MxN pooling for 16-bit floating point values. (NHWC)
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Template function to perform 2x2 pooling for 8bit quantized fixed point. (NCHW)
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    template <typename T>
-    void pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Template function to perform 3x3 pooling for 8bit quantized fixed point. (NCHW)
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    template <typename T>
-    void pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Template function to perform MxN pooling for 8-bit quantized. (NCHW)
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    template <typename T>
-    void poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Template function to perform MxN pooling for 8-bit quantized. (NHWC)
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    template <typename T>
-    void poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Common signature for all the specialised Pooling functions
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    using PoolingFunction = void (NEPoolingLayerKernel::*)(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding);
-
-private:
-    PoolingFunction  _func;
-    const ITensor   *_input;
-    ITensor         *_output;
-    ITensor         *_indices;
-    PoolingLayerInfo _pool_info;
-    DataLayout       _data_layout;
-    unsigned int     _num_elems_processed_per_iteration;
-    BorderSize       _border_size;
-    bool             _is_square;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.cpp
deleted file mode 100644
index 0440666..0000000
--- a/src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.cpp
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-void NEPoolingAssemblyWrapperKernel::configure(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output initialization if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, info)));
-
-    const bool requantize = input->quantization_info() != output->quantization_info();
-
-    switch(input->data_type())
-    {
-        case DataType::QASYMM8:
-            if(requantize)
-            {
-                create_arm_pooling_requant<uint8_t, uint8_t>(input, output, info, cpu_info);
-            }
-            else
-            {
-                create_arm_pooling<uint8_t, uint8_t>(input, output, info, cpu_info);
-            }
-            break;
-        case DataType::QASYMM8_SIGNED:
-            if(requantize)
-            {
-                create_arm_pooling_requant<int8_t, int8_t>(input, output, info, cpu_info);
-            }
-            else
-            {
-                create_arm_pooling<int8_t, int8_t>(input, output, info, cpu_info);
-            }
-            break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            create_arm_pooling<float16_t, float16_t>(input, output, info, cpu_info);
-            break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::F32:
-            create_arm_pooling<float, float>(input, output, info, cpu_info);
-            break;
-        default:
-            break;
-    }
-
-    Window win = calculate_max_window(*output, Steps());
-    INEKernel::configure(win);
-}
-
-Status NEPoolingAssemblyWrapperKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-#ifndef __aarch64__
-    ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
-#endif /* __aarch64__ */
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
-                                    "Only AVG and MAX pooling are supported by assembly kernels");
-
-    if(output->total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-        const auto input_qinfo  = input->quantization_info().uniform();
-        const auto output_qinfo = output->quantization_info().uniform();
-
-        if(input_qinfo != output_qinfo)
-        {
-            const float multiplier = input_qinfo.scale / output_qinfo.scale;
-            int32_t     output_multiplier{};
-            int32_t     output_shift{};
-            ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
-        }
-        else
-        {
-            if(input->data_type() == DataType::QASYMM8)
-            {
-                const bool has_padding = info.pad_stride_info.has_padding();
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same input/output quantization info");
-            }
-        }
-    }
-    else
-    {
-        if(input->data_type() == DataType::QASYMM8)
-        {
-            // If output is not configured, the quantization info are the same
-            const bool has_padding = info.pad_stride_info.has_padding();
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same input/output quantization info");
-        }
-    }
-    return Status{};
-}
-
-void NEPoolingAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_UNUSED(window);
-    ARM_COMPUTE_UNUSED(info);
-
-    ARM_COMPUTE_ERROR_ON(tensors.empty());
-
-    const ITensor *input     = tensors.get_const_tensor(TensorType::ACL_SRC);
-    ITensor       *output    = tensors.get_tensor(TensorType::ACL_DST_0);
-    ITensor       *workspace = tensors.get_tensor(TensorType::ACL_DST_1);
-
-    const auto in_ptr        = input->buffer() + input->info()->offset_first_element_in_bytes();
-    auto       out_ptr       = output->buffer() + output->info()->offset_first_element_in_bytes();
-    auto       working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
-
-    const auto input_shape    = input->info()->tensor_shape();
-    const auto output_shape   = output->info()->tensor_shape();
-    const auto input_padding  = input->info()->padding();
-    const auto output_padding = output->info()->padding();
-
-    const size_t ld_input_col    = input_shape[0] + input_padding.left + input_padding.right;
-    const size_t ld_input_row    = ld_input_col * (input_shape[1] + input_padding.top + input_padding.bottom);
-    const size_t ld_input_batch  = ld_input_row * input_shape[2];
-    const size_t ld_output_col   = output_shape[0] + output_padding.right;
-    const size_t ld_output_row   = ld_output_col * (output_shape[1] + output_padding.top + output_padding.bottom);
-    const size_t ld_output_batch = ld_output_row * output_shape[2];
-
-    _kernel_asm->execute(in_ptr, ld_input_col, ld_input_row, ld_input_batch,
-                         out_ptr, ld_output_col, ld_output_row, ld_output_batch,
-                         working_space, info.thread_id, info.num_threads);
-}
-
-size_t NEPoolingAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
-{
-    return _kernel_asm->get_working_size(num_threads);
-}
-
-bool NEPoolingAssemblyWrapperKernel::is_configured() const
-{
-    return _kernel_asm != nullptr;
-}
-
-template <typename TypeInput, typename TypeOutput>
-void NEPoolingAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
-    const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
-
-    arm_conv::pooling::PoolingWindow window{};
-    window.cols = static_cast<unsigned int>(info.pool_size.x());
-    window.rows = static_cast<unsigned int>(info.pool_size.y());
-
-    arm_conv::pooling::PoolingStride stride{};
-    std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
-
-    const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
-
-    constexpr unsigned int idx_width    = 1;
-    constexpr unsigned int idx_height   = 2;
-    constexpr unsigned int idx_channels = 0;
-    constexpr unsigned int idx_batches  = 3;
-
-    const unsigned int n_batches   = input->dimension(idx_batches);
-    const unsigned int input_rows  = input->dimension(idx_height);
-    const unsigned int input_cols  = input->dimension(idx_width);
-    const unsigned int n_channels  = input->dimension(idx_channels);
-    const unsigned int output_rows = output->dimension(idx_height);
-    const unsigned int output_cols = output->dimension(idx_width);
-
-    arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, input_rows, input_cols, n_channels, output_rows, output_cols, padding, nullptr);
-
-    // Configure assembly pooling kernel
-    auto pooling_kernel_asm = arm_conv::pooling::pooling<TypeInput, TypeOutput>(args);
-    if(pooling_kernel_asm == nullptr)
-    {
-        // Configuration not supported: Leave function unconfigured:
-        return;
-    }
-
-    _kernel_asm = std::move(pooling_kernel_asm);
-}
-
-template <typename TypeInput, typename TypeOutput>
-void NEPoolingAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
-    const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
-
-    arm_conv::pooling::PoolingWindow window{};
-    window.cols = static_cast<unsigned int>(info.pool_size.x());
-    window.rows = static_cast<unsigned int>(info.pool_size.y());
-
-    arm_conv::pooling::PoolingStride stride{};
-    std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
-
-    const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
-
-    constexpr unsigned int idx_width    = 1;
-    constexpr unsigned int idx_height   = 2;
-    constexpr unsigned int idx_channels = 0;
-    constexpr unsigned int idx_batches  = 3;
-
-    const unsigned int n_batches   = input->dimension(idx_batches);
-    const unsigned int input_rows  = input->dimension(idx_height);
-    const unsigned int input_cols  = input->dimension(idx_width);
-    const unsigned int n_channels  = input->dimension(idx_channels);
-    const unsigned int output_rows = output->dimension(idx_height);
-    const unsigned int output_cols = output->dimension(idx_width);
-
-    arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, input_rows, input_cols, n_channels, output_rows, output_cols, padding, nullptr);
-
-    const auto input_qinfo  = input->quantization_info().uniform();
-    const auto output_qinfo = output->quantization_info().uniform();
-
-    const float multiplier = input_qinfo.scale / output_qinfo.scale;
-    int32_t     output_multiplier{};
-    int32_t     output_shift{};
-    quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
-
-    const arm_conv::pooling::Requantize32 requant_args(input_qinfo.offset,
-                                                       output_qinfo.offset,
-                                                       output_shift, // left shift
-                                                       0,            // right shift
-                                                       output_multiplier);
-
-    // Configure assembly pooling kernel with requantization
-    auto pooling_kernel_asm = arm_conv::pooling::pooling<TypeInput, TypeOutput, arm_conv::pooling::Requantize32>(args, requant_args);
-    if(pooling_kernel_asm == nullptr)
-    {
-        // Configuration not supported: Leave function unconfigured:
-        return;
-    }
-
-    _kernel_asm = std::move(pooling_kernel_asm);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.h b/src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.h
deleted file mode 100644
index b2fa5b5..0000000
--- a/src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ASSEMBLY_POOLING_KERNEL_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_ASSEMBLY_POOLING_KERNEL_WRAPPER_KERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
-
-#include "pool_common.hpp"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** This class is a wrapper for the assembly kernels.
-  *
-  * Some kernels were written in assembly and highly optimised for specific
-  * CPUs like A53 or A55. The arm compute library creates an instance of
-  * NEPoolingAssemblyWrapperKernel and other auxiliary data structures to
-  * execute a single assembly kernel in the context of an NEFunction.
-  *
-  */
-class NEPoolingAssemblyWrapperKernel final : public INEKernel
-{
-public:
-    /** Constructor
-     */
-    NEPoolingAssemblyWrapperKernel()                                  = default;
-    NEPoolingAssemblyWrapperKernel(NEPoolingAssemblyWrapperKernel &)  = delete;
-    NEPoolingAssemblyWrapperKernel(NEPoolingAssemblyWrapperKernel &&) = default;
-    NEPoolingAssemblyWrapperKernel &operator=(NEPoolingAssemblyWrapperKernel &) = delete;
-
-    const char *name() const override
-    {
-        return "NEPoolingAssemblyWrapperKernel";
-    }
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output Output tensor to store the result of pooling. Data types supported: same as @p input.
-     * @param[in]  info   Pooling meta-data
-     */
-    void configure(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
-
-    /** Indicates whether or not this function can be used to process the given parameters.
-     *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] output Output tensor to store the result of pooling. Data types supported: same as @p input.
-     * @param[in] info   Pooling meta-data
-     *
-     * @return a status.
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info);
-
-    // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-
-    /** Get size of the workspace needed by the assembly kernel.
-     *
-     * @param[in] num_threads Maximum number of threads that are going to be spawned.
-     *
-     * @return size of workspace
-     */
-    size_t get_working_size(unsigned int num_threads) const;
-
-    /** Was the asm kernel successfully configured?
-     *
-     * @return True if the asm kernel is configured and ready to run
-     */
-    bool is_configured() const;
-
-private:
-    /** Helper function to create the assembly kernel.
-     *
-     * @param[in] input  Input tensor info.
-     * @param[in] output Output tensor info.
-     * @param[in] info   Pooling layer meta-data.
-     */
-    template <typename TypeInput, typename TypeOutput>
-    void create_arm_pooling(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
-
-    /** Helper function to create the assembly kernel with requantization support
-     *
-     * @param[in] input  Input tensor info.
-     * @param[in] output Output tensor info.
-     * @param[in] info   Pooling layer meta-data.
-     */
-    template <typename TypeInput, typename TypeOutput>
-    void create_arm_pooling_requant(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
-
-    std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{ nullptr };
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ASSEMBLY_POOLING_KERNEL_WRAPPER_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp
new file mode 100644
index 0000000..19a0e90
--- /dev/null
+++ b/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+void CpuPoolingAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // dst initialization if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info)));
+
+    const bool requantize = src->quantization_info() != dst->quantization_info();
+
+    switch(src->data_type())
+    {
+        case DataType::QASYMM8:
+            if(requantize)
+            {
+                create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
+            }
+            else
+            {
+                create_arm_pooling<uint8_t, uint8_t>(src, dst, info, cpu_info);
+            }
+            break;
+        case DataType::QASYMM8_SIGNED:
+            if(requantize)
+            {
+                create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info);
+            }
+            else
+            {
+                create_arm_pooling<int8_t, int8_t>(src, dst, info, cpu_info);
+            }
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            create_arm_pooling<float16_t, float16_t>(src, dst, info, cpu_info);
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        case DataType::F32:
+            create_arm_pooling<float, float>(src, dst, info, cpu_info);
+            break;
+        default:
+            break;
+    }
+
+    Window win = calculate_max_window(*dst, Steps());
+    INEKernel::configure(win);
+}
+
+Status CpuPoolingAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+#ifndef __aarch64__
+    ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
+#endif /* __aarch64__ */
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
+                                    "Only AVG and MAX pooling are supported by assembly kernels");
+
+    if(dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+        const auto src_qinfo = src->quantization_info().uniform();
+        const auto dst_qinfo = dst->quantization_info().uniform();
+
+        if(src_qinfo != dst_qinfo)
+        {
+            const float multiplier = src_qinfo.scale / dst_qinfo.scale;
+            int32_t     dst_multiplier{};
+            int32_t     dst_shift{};
+            ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
+        }
+        else
+        {
+            if(src->data_type() == DataType::QASYMM8)
+            {
+                const bool has_padding = info.pad_stride_info.has_padding();
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+            }
+        }
+    }
+    else
+    {
+        if(src->data_type() == DataType::QASYMM8)
+        {
+            // If dst is not configured, the quantization info is the same
+            const bool has_padding = info.pad_stride_info.has_padding();
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+        }
+    }
+    return Status{};
+}
+
+void CpuPoolingAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_UNUSED(info);
+
+    ARM_COMPUTE_ERROR_ON(tensors.empty());
+
+    const ITensor *src       = tensors.get_const_tensor(TensorType::ACL_SRC);
+    ITensor       *dst       = tensors.get_tensor(TensorType::ACL_DST_0);
+    ITensor       *workspace = tensors.get_tensor(TensorType::ACL_DST_1);
+
+    const auto in_ptr        = src->buffer() + src->info()->offset_first_element_in_bytes();
+    auto       out_ptr       = dst->buffer() + dst->info()->offset_first_element_in_bytes();
+    auto       working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
+
+    const auto src_shape   = src->info()->tensor_shape();
+    const auto dst_shape   = dst->info()->tensor_shape();
+    const auto src_padding = src->info()->padding();
+    const auto dst_padding = dst->info()->padding();
+
+    const size_t ld_src_col   = src_shape[0] + src_padding.left + src_padding.right;
+    const size_t ld_src_row   = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
+    const size_t ld_src_batch = ld_src_row * src_shape[2];
+    const size_t ld_dst_col   = dst_shape[0] + dst_padding.left + dst_padding.right;
+    const size_t ld_dst_row   = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
+    const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
+
+    _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch,
+                         out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
+                         working_space, info.thread_id, info.num_threads);
+}
+
+size_t CpuPoolingAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
+{
+    return _kernel_asm->get_working_size(num_threads);
+}
+
+bool CpuPoolingAssemblyWrapperKernel::is_configured() const
+{
+    return _kernel_asm != nullptr;
+}
+
+template <typename Typesrc, typename Typedst>
+void CpuPoolingAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+{
+    const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
+
+    arm_conv::pooling::PoolingWindow window{};
+    window.cols = static_cast<unsigned int>(info.pool_size.x());
+    window.rows = static_cast<unsigned int>(info.pool_size.y());
+
+    arm_conv::pooling::PoolingStride stride{};
+    std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
+
+    const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+
+    constexpr unsigned int idx_width    = 1;
+    constexpr unsigned int idx_height   = 2;
+    constexpr unsigned int idx_channels = 0;
+    constexpr unsigned int idx_batches  = 3;
+
+    const unsigned int n_batches  = src->dimension(idx_batches);
+    const unsigned int src_rows   = src->dimension(idx_height);
+    const unsigned int src_cols   = src->dimension(idx_width);
+    const unsigned int n_channels = src->dimension(idx_channels);
+    const unsigned int dst_rows   = dst->dimension(idx_height);
+    const unsigned int dst_cols   = dst->dimension(idx_width);
+
+    arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+
+    // Configure assembly pooling kernel
+    auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args);
+    if(pooling_kernel_asm == nullptr)
+    {
+        // Configuration not supported: Leave function unconfigured:
+        return;
+    }
+
+    _kernel_asm = std::move(pooling_kernel_asm);
+}
+
+template <typename Typesrc, typename Typedst>
+void CpuPoolingAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+{
+    const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
+
+    arm_conv::pooling::PoolingWindow window{};
+    window.cols = static_cast<unsigned int>(info.pool_size.x());
+    window.rows = static_cast<unsigned int>(info.pool_size.y());
+
+    arm_conv::pooling::PoolingStride stride{};
+    std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
+
+    const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+
+    constexpr unsigned int idx_width    = 1;
+    constexpr unsigned int idx_height   = 2;
+    constexpr unsigned int idx_channels = 0;
+    constexpr unsigned int idx_batches  = 3;
+
+    const unsigned int n_batches  = src->dimension(idx_batches);
+    const unsigned int src_rows   = src->dimension(idx_height);
+    const unsigned int src_cols   = src->dimension(idx_width);
+    const unsigned int n_channels = src->dimension(idx_channels);
+    const unsigned int dst_rows   = dst->dimension(idx_height);
+    const unsigned int dst_cols   = dst->dimension(idx_width);
+
+    arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+
+    const auto src_qinfo = src->quantization_info().uniform();
+    const auto dst_qinfo = dst->quantization_info().uniform();
+
+    const float multiplier = src_qinfo.scale / dst_qinfo.scale;
+    int32_t     dst_multiplier{};
+    int32_t     dst_shift{};
+    quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
+
+    const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset,
+                                                       dst_qinfo.offset,
+                                                       dst_shift, // left shift
+                                                       0,         // right shift
+                                                       dst_multiplier);
+
+    // Configure assembly pooling kernel with requantization
+    auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
+    if(pooling_kernel_asm == nullptr)
+    {
+        // Configuration not supported: Leave function unconfigured:
+        return;
+    }
+
+    _kernel_asm = std::move(pooling_kernel_asm);
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h b/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h
new file mode 100644
index 0000000..34ec452
--- /dev/null
+++ b/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H
+#define ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/kernels/assembly/pooling.hpp"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+#include "pool_common.hpp"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** This class is a wrapper for the assembly kernels.
+  *
+  * Some kernels were written in assembly and highly optimised for specific
+  * CPUs like A53 or A55. The arm compute library creates an instance of
+  * CpuPoolingAssemblyWrapperKernel and other auxiliary data structures to
+  * execute a single assembly kernel in the context of an NEFunction.
+  *
+  */
+class CpuPoolingAssemblyWrapperKernel final : public ICpuKernel
+{
+public:
+    /** Constructor
+     */
+    CpuPoolingAssemblyWrapperKernel()                                   = default;
+    CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &)  = delete;
+    CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &&) = default;
+    CpuPoolingAssemblyWrapperKernel &operator=(CpuPoolingAssemblyWrapperKernel &) = delete;
+
+    const char *name() const override
+    {
+        return "CpuPoolingAssemblyWrapperKernel";
+    }
+
+    /** Initialise the kernel's src and dst.
+     *
+     * @param[in]  src      Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst      Destination tensor info to store the result of pooling. Data types supported: same as @p src.
+     * @param[in]  info     Pooling meta-data.
+     * @param[in]  cpu_info CPU information needed to select the most appropriate kernel.
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+
+    /** Indicates whether or not this function can be used to process the given parameters.
+     *
+     * @param[in] src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] dst  Destination tensor info to store the result of pooling. Data types supported: same as @p src.
+     * @param[in] info Pooling meta-data
+     *
+     * @return a status.
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
+    /** Get size of the workspace needed by the assembly kernel.
+     *
+     * @param[in] num_threads Maximum number of threads that are going to be spawned.
+     *
+     * @return size of workspace
+     */
+    size_t get_working_size(unsigned int num_threads) const;
+
+    /** Was the asm kernel successfully configured?
+     *
+     * @return True if the asm kernel is configured and ready to run
+     */
+    bool is_configured() const;
+
+private:
+    /** Helper function to create the assembly kernel.
+     *
+     * @param[in] src  Source tensor info.
+     * @param[in] dst  Destination tensor info.
+     * @param[in] info Pooling layer meta-data.
+     */
+    template <typename Typesrc, typename Typedst>
+    void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+
+    /** Helper function to create the assembly kernel with requantization support
+     *
+     * @param[in] src  Source tensor info.
+     * @param[in] dst  Destination tensor info.
+     * @param[in] info Pooling layer meta-data.
+     */
+    template <typename Typesrc, typename Typedst>
+    void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+
+    std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{ nullptr };
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/cpu/kernels/CpuPoolingKernel.cpp
similarity index 71%
rename from src/core/NEON/kernels/NEPoolingLayerKernel.cpp
rename to src/core/cpu/kernels/CpuPoolingKernel.cpp
index b46843b..a29aef4 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/cpu/kernels/CpuPoolingKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,13 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "src/core/cpu/kernels/CpuPoolingKernel.h"
 
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -41,16 +38,14 @@
 #include "support/ToolchainSupport.h"
 
 #include "src/core/NEON/wrapper/wrapper.h"
-#include <algorithm>
 #include <arm_neon.h>
-#include <cmath>
-#include <limits>
-#include <set>
-#include <string>
-#include <tuple>
 
 namespace arm_compute
 {
+namespace cpu
+{
+namespace kernels
+{
 using namespace misc::shape_calculator;
 
 namespace
@@ -138,10 +133,10 @@
     v = wrapper::vsetlane(elems[7], v, 7);
 }
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info,
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info,
                           unsigned int &pooled_w, unsigned int pooled_h, const ITensorInfo *indices, Size2D pool_size)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
 
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
@@ -149,25 +144,25 @@
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
 
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
     if(indices)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
     }
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
-                                    && (input->data_layout() == DataLayout::NHWC),
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type()));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
+                                    && (src->data_layout() == DataLayout::NHWC),
                                     "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
 
-    if(output->total_size() != 0)
+    if(dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
-                                    || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON((dst->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
+                                    || (dst->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
 
         if(indices)
         {
@@ -188,29 +183,29 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
                                                         unsigned int &num_elems_processed_per_iteration,
                                                         BorderSize   &border_size,
                                                         unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
 {
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
+    // dst auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)));
     if(indices)
     {
         // Indices auto inizialitation if not yet initialized
-        auto_init_if_empty(*indices, (input->clone()->set_tensor_shape(compute_pool_shape(*input,
-                                                                                          pool_info)))
+        auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src,
+                                                                                        pool_info)))
                            .set_data_type(DataType::U32) /* we store the offset to the element */);
     }
-    const auto          data_layout                  = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
+    const auto          data_layout                  = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
     unsigned int        num_elems_read_per_iteration = 0;
     unsigned int        num_elems_horizontal_window  = 0;
     int                 pool_stride_x                = 0;
     int                 pool_stride_y                = 0;
     const int           idx_width                    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int           idx_height                   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int           input_width                  = input->dimension(idx_width);
-    const int           input_height                 = input->dimension(idx_height);
+    const int           src_width                    = src->dimension(idx_width);
+    const int           src_height                   = src->dimension(idx_height);
     const PadStrideInfo pad_stride_info              = pool_info.pad_stride_info;
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
     const int  pool_pad_right  = pad_stride_info.pad_right();
@@ -219,9 +214,9 @@
     const int  pool_pad_bottom = pad_stride_info.pad_bottom();
     const bool is_square       = pool_size_x == pool_size_y;
 
-    // Check output dimensions
-    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
-                                                     input->dimension(idx_height),
+    // Check dst dimensions
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(src->dimension(idx_width),
+                                                     src->dimension(idx_height),
                                                      pool_size_x,
                                                      pool_size_y,
                                                      pad_stride_info);
@@ -233,7 +228,7 @@
 
     if(is_square)
     {
-        switch(input->data_type())
+        switch(src->data_type())
         {
             case DataType::QASYMM8:
             case DataType::QASYMM8_SIGNED:
@@ -299,28 +294,28 @@
         // Number of iterations in X dimension
         const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
         // Upper limit for the number of right/bottom border elements that are accessed
-        const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
-        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
+        const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width;
+        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height;
         border_size             = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
         border_size.right       = std::max(upper_bound_w, pool_pad_right);
         border_size.bottom      = std::max(upper_bound_h, pool_pad_bottom);
-        TensorShape output_shape{ input->tensor_shape() };
-        output_shape.set(0, pooled_w);
-        output_shape.set(1, pooled_h);
-        TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
-        win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
-        AccessWindowStatic     input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
-        AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
+        TensorShape dst_shape{ src->tensor_shape() };
+        dst_shape.set(0, pooled_w);
+        dst_shape.set(1, pooled_h);
+        TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape));
+        win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration));
+        AccessWindowStatic     src_access(src, -pool_pad_left, -pool_pad_top, src_width + border_size.right, src_height + border_size.bottom);
+        AccessWindowHorizontal dst_access(dst, 0, num_elems_horizontal_window);
         if(indices)
         {
             AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
-            window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
+            window_changed = update_window_and_padding(win, src_access, dst_access, indices_access);
         }
         else
         {
-            window_changed = update_window_and_padding(win, input_access, output_access);
+            window_changed = update_window_and_padding(win, src_access, dst_access);
         }
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+        dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
     }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -438,79 +433,71 @@
 
 } // namespace
 
-NEPoolingLayerKernel::NEPoolingLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
-{
-}
-
-BorderSize NEPoolingLayerKernel::border_size() const
+BorderSize CpuPoolingKernel::border_size() const
 {
     return _border_size;
 }
 
-void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
+void CpuPoolingKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info;
     const bool          is_global_pooling = pool_info.is_global_pooling;
     const int           pool_stride_x     = pad_stride_info.stride().first;
 
     // Get data layout
-    const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
+    const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
     const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     // Update pool size in case of global pooling
     const Size2D pool_size(
-        is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
-        is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);
+        is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+        is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height);
 
     // Validate pool info before calling scaled_dimensions
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
 
-    // Check output dimensions
+    // Check dst dimensions
     unsigned int pooled_w;
     unsigned int pooled_h;
-    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
-                                                     input->info()->dimension(idx_height),
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(src->dimension(idx_width),
+                                                     src->dimension(idx_height),
                                                      pool_size.x(),
                                                      pool_size.y(),
                                                      pad_stride_info);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, (indices) ? indices->info() : nullptr, pool_size));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, pooled_w, pooled_h, indices, pool_size));
 
     // Set instance variables
-    _input       = input;
-    _output      = output;
-    _indices     = indices;
     _pool_info   = pool_info;
-    _data_layout = input->info()->data_layout();
+    _data_layout = src->data_layout();
     _is_square   = (pool_size.x() == pool_size.y());
 
     // Get data type
-    const DataType data_type = input->info()->data_type();
+    const DataType data_type = src->data_type();
     const bool     is_nchw   = _data_layout == DataLayout::NCHW;
 
     if(data_type == DataType::QASYMM8)
     {
         if(!is_nchw)
         {
-            _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
+            _func = &CpuPoolingKernel::poolingMxN_q8_nhwc<uint8_t>;
         }
         else
         {
             if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
             {
-                _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
+                _func = &CpuPoolingKernel::pooling2_q8_nchw<uint8_t>;
             }
             else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
             {
-                _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
+                _func = &CpuPoolingKernel::pooling3_q8_nchw<uint8_t>;
             }
             else
             {
-                _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
+                _func = &CpuPoolingKernel::poolingMxN_q8_nchw<uint8_t>;
             }
         }
     }
@@ -518,21 +505,21 @@
     {
         if(!is_nchw)
         {
-            _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
+            _func = &CpuPoolingKernel::poolingMxN_q8_nhwc<int8_t>;
         }
         else
         {
             if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
             {
-                _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
+                _func = &CpuPoolingKernel::pooling2_q8_nchw<int8_t>;
             }
             else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
             {
-                _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
+                _func = &CpuPoolingKernel::pooling3_q8_nchw<int8_t>;
             }
             else
             {
-                _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
+                _func = &CpuPoolingKernel::poolingMxN_q8_nchw<int8_t>;
             }
         }
     }
@@ -540,7 +527,7 @@
     {
         if(!is_nchw)
         {
-            _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
+            _func = &CpuPoolingKernel::poolingMxN_f16_nhwc;
         }
         else
         {
@@ -550,24 +537,24 @@
                 {
                     case 2:
                     {
-                        _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
+                        _func = &CpuPoolingKernel::pooling2_f16_nchw;
                     }
                     break;
                     case 3:
                     {
-                        _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
+                        _func = &CpuPoolingKernel::pooling3_f16_nchw;
                     }
                     break;
                     default:
                     {
-                        _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
+                        _func = &CpuPoolingKernel::poolingMxN_f16_nchw;
                         break;
                     }
                 }
             }
             else
             {
-                _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
+                _func = &CpuPoolingKernel::poolingMxN_f16_nchw;
             }
         }
     }
@@ -575,7 +562,7 @@
     {
         if(!is_nchw)
         {
-            _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
+            _func = &CpuPoolingKernel::poolingMxN_f32_nhwc;
         }
         else
         {
@@ -585,29 +572,29 @@
                 {
                     case 2:
                     {
-                        _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
+                        _func = &CpuPoolingKernel::pooling2_f32_nchw;
                         break;
                     }
                     case 3:
                     {
-                        _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
+                        _func = &CpuPoolingKernel::pooling3_f32_nchw;
                         break;
                     }
                     case 7:
                     {
-                        _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
+                        _func = &CpuPoolingKernel::pooling7_f32_nchw;
                         break;
                     }
                     default:
                     {
-                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
+                        _func = &CpuPoolingKernel::poolingMxN_f32_nchw;
                         break;
                     }
                 }
             }
             else
             {
-                _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
+                _func = &CpuPoolingKernel::poolingMxN_f32_nchw;
             }
         }
     }
@@ -615,19 +602,19 @@
     if(!is_nchw)
     {
         // Configure kernel window
-        Window      win = calculate_max_window(*output->info(), Steps());
+        Window      win = calculate_max_window(*dst, Steps());
         Coordinates coord;
-        coord.set_num_dimensions(output->info()->num_dimensions());
-        output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-        INEKernel::configure(win);
+        coord.set_num_dimensions(dst->num_dimensions());
+        dst->set_valid_region(ValidRegion(coord, dst->tensor_shape()));
+        ICpuKernel::configure(win);
     }
     else
     {
         // Configure kernel window
-        auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr,
-                                                        pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
+        auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration,
+                                                        _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
         ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-        INEKernel::configure(win_config.second);
+        ICpuKernel::configure(win_config.second);
     }
 }
 
@@ -666,10 +653,10 @@
 }
 
 template <typename T>
-void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling2_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window);
 
     /** NEON vector types */
     using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -688,26 +675,26 @@
     const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
     const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const T *const input_top_ptr    = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
-    const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+    const T *const src_top_ptr    = reinterpret_cast<const T *>(_src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+    const T *const src_bottom_ptr = reinterpret_cast<const T *>(_src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
 
     const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
 
-    const UniformQuantizationInfo input_qinfo          = _input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo output_qinfo         = _output->info()->quantization_info().uniform();
-    const bool                    have_different_qinfo = input_qinfo != output_qinfo;
+    const UniformQuantizationInfo src_qinfo            = _src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo dst_qinfo            = _dst->info()->quantization_info().uniform();
+    const bool                    have_different_qinfo = src_qinfo != dst_qinfo;
 
-    const float                   requant_scale  = output_qinfo.scale / input_qinfo.scale;
-    const int32_t                 requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
+    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
+    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
     const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const auto top_data    = wrapper::vloadq(input_top_ptr + input.offset());
-        const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
+        const auto top_data    = wrapper::vloadq(src_top_ptr + src.offset());
+        const auto bottom_data = wrapper::vloadq(src_bottom_ptr + src.offset());
         q8x8_t     lower_res   = {};
         q8x8_t     upper_res   = {};
 
@@ -774,32 +761,32 @@
 
         if(have_different_qinfo)
         {
-            const auto requantized_output = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
-            lower_res                     = wrapper::vgetlow(requantized_output);
-            upper_res                     = wrapper::vgethigh(requantized_output);
+            const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
+            lower_res                  = wrapper::vgetlow(requantized_dst);
+            upper_res                  = wrapper::vgethigh(requantized_dst);
         }
 
         // Store result
         if(pool_stride_x == 1)
         {
             const q8x8x2_t res = { { lower_res, upper_res } };
-            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
+            wrapper::vstore(reinterpret_cast<T *>(dst.ptr()), res);
         }
         else
         {
-            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
+            wrapper::vstore(reinterpret_cast<T *>(dst.ptr()), lower_res);
         }
     },
-    input, output);
+    src, dst);
 }
 
-void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling3_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     ARM_COMPUTE_UNUSED(pooling_type);
     ARM_COMPUTE_UNUSED(exclude_padding);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window);
 
     constexpr const int pool_size       = 3;
     const int           pool_pad_right  = _pool_info.pad_stride_info.pad_right();
@@ -809,18 +796,18 @@
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+    const unsigned char *const src_top_ptr    = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const unsigned char *const src_middle_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const unsigned char *const src_bottom_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
-        float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
-        float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+        float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + src.offset()));
+        float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(src_middle_ptr + src.offset()));
+        float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + src.offset()));
         float16x4_t res         = {};
 
         // Get power of 2 in case of l2 pooling
@@ -854,11 +841,11 @@
             res = vinv_f16(vinvsqrt_f16(res));
         }
 
-        *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
+        *(reinterpret_cast<float16_t *>(dst.ptr())) = vget_lane_f16(res, 0);
     },
-    input, output);
+    src, dst);
 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_UNUSED(window_input);
+    ARM_COMPUTE_UNUSED(window_src);
     ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -867,52 +854,52 @@
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <typename T>
 inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
-f16_to_f32(float16x4_t input)
+f16_to_f32(float16x4_t src)
 {
-    float32x2_t output = { static_cast<float>(vget_lane_f16(input, 0)), static_cast<float>(vget_lane_f16(input, 1)) };
-    return output;
+    float32x2_t dst = { static_cast<float>(vget_lane_f16(src, 0)), static_cast<float>(vget_lane_f16(src, 1)) };
+    return dst;
 }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template <typename T>
 inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
-f16_to_f32(float32x2_t input)
+f16_to_f32(float32x2_t src)
 {
-    return input;
+    return src;
 }
 
 template <typename T>
-void NEPoolingLayerKernel::pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window)
+void CpuPoolingKernel::pooling2_nchw_maxpool_indices(const Window &window_src, const Window &window)
 {
-    Iterator  input(_input, window_input);
-    Iterator  output(_output, window);
+    Iterator  src(_src, window_src);
+    Iterator  dst(_dst, window);
     Iterator  indices(_indices, window);
     const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
     const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
     int       pool_stride_x = 0;
     int       pool_stride_y = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-    const int            pad_left         = _input->info()->padding().left;
-    const int            pad_right        = _input->info()->padding().right;
-    const int            in_stride_y      = static_cast<int>(_input->info()->strides_in_bytes().y());
+    const uint8_t *const src_top_ptr    = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const src_bottom_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const int            pad_left       = _src->info()->padding().left;
+    const int            pad_right      = _src->info()->padding().right;
+    const int            in_stride_y    = static_cast<int>(_src->info()->strides_in_bytes().y());
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        auto        top_data        = wrapper::vload(reinterpret_cast<const T *>(input_top_ptr + input.offset()));
-        auto        bottom_data     = wrapper::vload(reinterpret_cast<const T *>(input_bottom_ptr + input.offset()));
+        auto        top_data        = wrapper::vload(reinterpret_cast<const T *>(src_top_ptr + src.offset()));
+        auto        bottom_data     = wrapper::vload(reinterpret_cast<const T *>(src_bottom_ptr + src.offset()));
         float32x2_t top_data_f32    = f16_to_f32<T>(top_data);
         float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
 
         // Calculate max data, compare top first, then bottom, to make sue the first max is recorded.
-        const float32x2_t max_data_top         = vpmax_f32(top_data_f32, top_data_f32);
-        const float32x2_t max_data_bottom      = vpmax_f32(bottom_data_f32, bottom_data_f32);
-        const float32x2_t max_data             = vmax_f32(max_data_top, max_data_bottom);
-        *(reinterpret_cast<T *>(output.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
+        const float32x2_t max_data_top      = vpmax_f32(top_data_f32, top_data_f32);
+        const float32x2_t max_data_bottom   = vpmax_f32(bottom_data_f32, bottom_data_f32);
+        const float32x2_t max_data          = vmax_f32(max_data_top, max_data_bottom);
+        *(reinterpret_cast<T *>(dst.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
 
         // Calculate max data indice, which will be used in max unpool.
-        const uint32_t   offset_base              = offset_no_padding<T>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+        const uint32_t   offset_base              = offset_no_padding<T>(src.offset(), id, *_src->info(), pool_stride_x, pool_stride_y);
         const uint32_t   offset_top               = (uint32_t)(offset_base / sizeof(T));
         const uint32_t   offset_bottom            = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
         const uint32x2_t voffset_top              = { offset_top, offset_top + 1u };
@@ -921,22 +908,22 @@
         const uint32x2_t tmp_indices_bottom       = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
         *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
     },
-    input, output, indices);
+    src, dst, indices);
 }
 
-void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling2_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     ARM_COMPUTE_UNUSED(pooling_type);
     ARM_COMPUTE_UNUSED(exclude_padding);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     if(pooling_type == PoolingType::MAX && _indices)
     {
-        pooling2_nchw_maxpool_indices<float16_t>(window_input, window);
+        pooling2_nchw_maxpool_indices<float16_t>(window_src, window);
     }
     else
     {
-        Iterator      input(_input, window_input);
-        Iterator      output(_output, window);
+        Iterator      src(_src, window_src);
+        Iterator      dst(_dst, window);
         constexpr int pool_size       = 2;
         const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
         const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
@@ -944,16 +931,16 @@
         const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
         int           pool_stride_x, pool_stride_y = 0;
         std::tie(pool_stride_x, pool_stride_y)     = _pool_info.pad_stride_info.stride();
-        const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-        const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+        const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+        const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-        const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-        const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+        const unsigned char *const src_top_ptr    = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+        const unsigned char *const src_bottom_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
 
         execute_window_loop(window, [&](const Coordinates & id)
         {
-            float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
-            float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+            float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + src.offset()));
+            float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + src.offset()));
             float16x4_t res         = {};
 
             // Get power of 2 in case of l2 pooling
@@ -984,22 +971,22 @@
             }
 
             // Store result
-            *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
+            *(reinterpret_cast<float16_t *>(dst.ptr())) = vget_lane_f16(res, 0);
         },
-        input, output);
+        src, dst);
     }
 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_UNUSED(window_input);
+    ARM_COMPUTE_UNUSED(window_src);
     ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
 template <typename T>
-void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling3_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window);
 
     /** NEON vector types */
     using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -1017,25 +1004,25 @@
     int           pool_stride_x   = 0;
     int           pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const UniformQuantizationInfo &input_qinfo  = _input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
+    const UniformQuantizationInfo &src_qinfo = _src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo &dst_qinfo = _dst->info()->quantization_info().uniform();
 
-    const float                   requant_scale  = output_qinfo.scale / input_qinfo.scale;
-    const int32_t                 requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
+    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
+    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
     const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
 
-    const T *const input_top_ptr    = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
-    const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
-    const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
+    const T *const src_top_ptr    = reinterpret_cast<const T *>(_src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+    const T *const src_middle_ptr = reinterpret_cast<const T *>(_src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+    const T *const src_bottom_ptr = reinterpret_cast<const T *>(_src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const auto top_data    = wrapper::vloadq(input_top_ptr + input.offset());
-        const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
-        const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
+        const auto top_data    = wrapper::vloadq(src_top_ptr + src.offset());
+        const auto middle_data = wrapper::vloadq(src_middle_ptr + src.offset());
+        const auto bottom_data = wrapper::vloadq(src_bottom_ptr + src.offset());
         q8x8_t     fres        = {};
         q8x16_t    fqres       = {};
 
@@ -1130,34 +1117,34 @@
         // Store result
         if(pool_stride_x == 1)
         {
-            if(input_qinfo != output_qinfo)
+            if(src_qinfo != dst_qinfo)
             {
                 fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
             }
-            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
+            wrapper::vstore(reinterpret_cast<T *>(dst.ptr()), fqres);
         }
         else
         {
-            if(input_qinfo != output_qinfo)
+            if(src_qinfo != dst_qinfo)
             {
                 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
             }
-            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
+            wrapper::vstore(reinterpret_cast<T *>(dst.ptr()), fres);
         }
     },
-    input, output);
+    src, dst);
 }
 
-void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     ARM_COMPUTE_UNUSED(pooling_type);
     ARM_COMPUTE_UNUSED(exclude_padding);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window);
 
-    const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
-    const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
+    const int pool_size_x     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().x() : _pool_info.pool_size.width;
+    const int pool_size_y     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.height;
     const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
     const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
     const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
@@ -1165,8 +1152,8 @@
     int       pool_stride_x   = 0;
     int       pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1185,8 +1172,8 @@
                 int x = 0;
                 for(; x <= (pool_size_x - 8); x += 8)
                 {
-                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
-                                                                                           (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+                                                                                           (_src->info()->strides_in_bytes().y())));
 
                     // Get power of 2 in case of l2 pooling and accumulate
                     if(pooling_type == PoolingType::L2)
@@ -1202,8 +1189,8 @@
                 // Leftover for loop
                 for(; x < pool_size_x; ++x)
                 {
-                    float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
-                                                                           + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
+                    float16_t data = *(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x())
+                                                                           + (y - pool_pad_top) * static_cast<int>(_src->info()->strides_in_bytes().y())));
 
                     // Get power of 2 in case of l2 pooling
                     if(pooling_type == PoolingType::L2)
@@ -1235,16 +1222,16 @@
                 int x = 0;
                 for(; x <= (pool_size_x - 8); x += 8)
                 {
-                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
-                                                                                           (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+                                                                                           (_src->info()->strides_in_bytes().y())));
                     vres                   = vmaxq_f16(vres, data);
                 }
 
                 // Leftover for loop
                 for(; x < pool_size_x; ++x)
                 {
-                    const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
-                                                                                 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
+                    const float16_t data = *(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x())
+                                                                                 + (y - pool_pad_top) * static_cast<int>(_src->info()->strides_in_bytes().y())));
                     res = std::max(res, data);
                 }
             }
@@ -1263,19 +1250,19 @@
         }
 
         // Store result
-        *(reinterpret_cast<float16_t *>(output.ptr())) = res;
+        *(reinterpret_cast<float16_t *>(dst.ptr())) = res;
     },
-    input, output);
+    src, dst);
 
 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_UNUSED(window_input);
+    ARM_COMPUTE_UNUSED(window_src);
     ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window)
+void CpuPoolingKernel::pooling2_f16_nhwc_maxpool_indices(const Window &window_src, const Window &window)
 {
     const int window_start_x = window.x().start();
     const int window_end_x   = window.x().end();
@@ -1284,8 +1271,8 @@
     Window window_out = window;
     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator input(_input, window_input);
-    Iterator output(_output, window_out);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window_out);
     Iterator indices(_indices, window_out);
 
     const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
@@ -1295,9 +1282,9 @@
     int pool_stride_y = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
 
-    const int pad_right   = _input->info()->padding().right;
-    const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
-    const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
+    const int pad_right   = _src->info()->padding().right;
+    const int in_stride_y = static_cast<int>(_src->info()->strides_in_bytes().y());
+    const int in_stride_z = static_cast<int>(_src->info()->strides_in_bytes().z());
 
     execute_window_loop(window_out, [&](const Coordinates & id)
     {
@@ -1306,36 +1293,36 @@
         const int pool_limit_y = pool_pad_top - idx_height;
         const int pool_limit_x = pool_pad_left - idx_width;
 
-        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
-        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
-        const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
-                                 (_input->info()->strides_in_bytes().z());
-        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
-                                 (_input->info()->strides_in_bytes().z());
-        const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
-                                 (_input->info()->strides_in_bytes().z());
-        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
-                                 (_input->info()->strides_in_bytes().z());
+        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+        const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
+                                 (_src->info()->strides_in_bytes().z());
+        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
+                                 (_src->info()->strides_in_bytes().z());
+        const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
+                                 (_src->info()->strides_in_bytes().z());
+        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
+                                 (_src->info()->strides_in_bytes().z());
 
         int x_off = window_start_x;
         for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
         {
-            const auto  in_x0_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off;
-            const auto  in_x1_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off;
-            const auto  in_x2_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off;
-            const auto  in_x3_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off;
+            const auto  in_x0_ptr = reinterpret_cast<const float16_t *>(src.ptr() + in_x0_offset) + x_off;
+            const auto  in_x1_ptr = reinterpret_cast<const float16_t *>(src.ptr() + in_x1_offset) + x_off;
+            const auto  in_x2_ptr = reinterpret_cast<const float16_t *>(src.ptr() + in_x2_offset) + x_off;
+            const auto  in_x3_ptr = reinterpret_cast<const float16_t *>(src.ptr() + in_x3_offset) + x_off;
             const auto  v_x0      = vld1q_f16(in_x0_ptr);
             const auto  v_x1      = vld1q_f16(in_x1_ptr);
             const auto  v_x2      = vld1q_f16(in_x2_ptr);
             const auto  v_x3      = vld1q_f16(in_x3_ptr);
             float16x8_t vres      = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
             // Store result
-            vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vres);
+            vst1q_f16(reinterpret_cast<float16_t *>(dst.ptr()) + x_off, vres);
 
-            const uint32_t   offset_base    = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+            const uint32_t   offset_base    = offset_no_padding<float16_t>(src.offset(), id, *_src->info(), pool_stride_x, pool_stride_y);
             const uint32_t   offset_x0      = (uint32_t)offset_base / sizeof(float16_t) + x_off;
             const uint32_t   offset_x1      = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
-            const uint32_t   offset_x2      = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
+            const uint32_t   offset_x2      = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _src->info()->tensor_shape()[1];
             const uint32_t   offset_x3      = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;
             const uint32x4_t voffset_x0_0   = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
             const uint32x4_t voffset_x0_1   = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
@@ -1362,19 +1349,19 @@
         // Left-overs loop
         for(; x_off < window_end_x; ++x_off)
         {
-            const auto x0  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off);
-            const auto x1  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off);
-            const auto x2  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off);
-            const auto x3  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off);
+            const auto x0  = *(reinterpret_cast<const float16_t *>(src.ptr() + in_x0_offset) + x_off);
+            const auto x1  = *(reinterpret_cast<const float16_t *>(src.ptr() + in_x1_offset) + x_off);
+            const auto x2  = *(reinterpret_cast<const float16_t *>(src.ptr() + in_x2_offset) + x_off);
+            const auto x3  = *(reinterpret_cast<const float16_t *>(src.ptr() + in_x3_offset) + x_off);
             float16_t  res = std::max(std::max(x2, x3), std::max(x0, x1));
 
             // Store result
-            *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;
+            *(reinterpret_cast<float16_t *>(dst.ptr()) + x_off) = res;
 
-            const uint32_t offset_base = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+            const uint32_t offset_base = offset_no_padding<float16_t>(src.offset(), id, *_src->info(), pool_stride_x, pool_stride_y);
             const uint32_t offset_x0   = (uint32_t)offset_base / sizeof(float16_t) + x_off;
             const uint32_t offset_x1   = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
-            const uint32_t offset_x2   = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
+            const uint32_t offset_x2   = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _src->info()->tensor_shape()[1];
             const uint32_t offset_x3   = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;
             const uint32_t tmp_idx0    = (x0 >= x1) ? offset_x0 : offset_x1;
             const uint32_t tmp_idx1    = (x2 >= x3) ? offset_x2 : offset_x3;
@@ -1384,18 +1371,18 @@
             *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
         }
     },
-    input, output, indices);
+    src, dst, indices);
 }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
-void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_f16_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     ARM_COMPUTE_UNUSED(pooling_type);
     ARM_COMPUTE_UNUSED(exclude_padding);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
     {
-        pooling2_f16_nhwc_maxpool_indices(window_input, window);
+        pooling2_f16_nhwc_maxpool_indices(window_src, window);
     }
     const int window_start_x = window.x().start();
     const int window_end_x   = window.x().end();
@@ -1404,11 +1391,11 @@
     Window window_out = window;
     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator input(_input, window_input);
-    Iterator output(_output, window_out);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window_out);
 
-    const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
-    const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
+    const int pool_size_x     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.width;
+    const int pool_size_y     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().z() : _pool_info.pool_size.height;
     const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
     const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
     const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
@@ -1416,8 +1403,8 @@
     int       pool_stride_x   = 0;
     int       pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_w = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _src->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
 
     float16x8_t vres;
 
@@ -1428,10 +1415,10 @@
         const int pool_limit_y = pool_pad_top - idx_height;
         const int pool_limit_x = pool_pad_left - idx_width;
 
-        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
-        const int pool_end_y   = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
-        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
-        const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+        const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+        const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
 
         int x_off = window_start_x;
         for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
@@ -1449,8 +1436,8 @@
                 {
                     for(int x = pool_start_x; x < pool_end_x; ++x)
                     {
-                        const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
-                                                                                               (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
+                        const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                                               (_src->info()->strides_in_bytes().z())) + x_off);
 
                         // Get power of 2 in case of l2 pooling and accumulate
                         if(pooling_type == PoolingType::L2)
@@ -1474,8 +1461,8 @@
                 {
                     for(int x = pool_start_x; x < pool_end_x; ++x)
                     {
-                        const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
-                                                                                               (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
+                        const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                                               (_src->info()->strides_in_bytes().z())) + x_off);
                         vres                   = vmaxq_f16(vres, data);
                     }
                 }
@@ -1489,7 +1476,7 @@
             }
 
             // Store result
-            vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vres);
+            vst1q_f16(reinterpret_cast<float16_t *>(dst.ptr()) + x_off, vres);
         }
 
         // Left-overs loop
@@ -1507,8 +1494,8 @@
                 {
                     for(int x = pool_start_x; x < pool_end_x; ++x)
                     {
-                        const float data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                 (_input->info()->strides_in_bytes().z())) + x_off);
+                        const float data = *(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                                 (_src->info()->strides_in_bytes().z())) + x_off);
 
                         // Get power of 2 in case of l2 pooling and accumulate
                         if(pooling_type == PoolingType::L2)
@@ -1532,8 +1519,8 @@
                 {
                     for(int x = pool_start_x; x < pool_end_x; ++x)
                     {
-                        const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                     (_input->info()->strides_in_bytes().z())) + x_off);
+                        const float16_t data = *(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                                     (_src->info()->strides_in_bytes().z())) + x_off);
                         res                  = std::max(res, data);
                     }
                 }
@@ -1546,25 +1533,25 @@
             }
 
             // Store result
-            *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;
+            *(reinterpret_cast<float16_t *>(dst.ptr()) + x_off) = res;
         }
     },
-    input, output);
+    src, dst);
 
 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_UNUSED(window_input);
+    ARM_COMPUTE_UNUSED(window_src);
     ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
-void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window);
 
-    const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
-    const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
+    const int pool_size_x     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().x() : _pool_info.pool_size.width;
+    const int pool_size_y     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.height;
     const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
     const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
     const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
@@ -1572,8 +1559,8 @@
     int       pool_stride_x   = 0;
     int       pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1592,8 +1579,8 @@
                 int x = 0;
                 for(; x <= (pool_size_x - 4); x += 4)
                 {
-                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
-                                                                                       (_input->info()->strides_in_bytes().y())));
+                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+                                                                                       (_src->info()->strides_in_bytes().y())));
 
                     // Get power of 2 in case of l2 pooling and accumulate
                     if(pooling_type == PoolingType::L2)
@@ -1609,8 +1596,8 @@
                 // Leftover for loop
                 for(; x < pool_size_x; ++x)
                 {
-                    float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
-                                                                   (_input->info()->strides_in_bytes().y())));
+                    float data = *(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+                                                                   (_src->info()->strides_in_bytes().y())));
 
                     // Get power of 2 in case of l2 pooling
                     if(pooling_type == PoolingType::L2)
@@ -1645,16 +1632,16 @@
                 int x = 0;
                 for(; x <= (pool_size_x - 4); x += 4)
                 {
-                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
-                                                                                       (_input->info()->strides_in_bytes().y())));
+                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+                                                                                       (_src->info()->strides_in_bytes().y())));
                     vres                   = vmaxq_f32(vres, data);
                 }
 
                 // Leftover for loop
                 for(; x < pool_size_x; ++x)
                 {
-                    const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
-                                                                         (_input->info()->strides_in_bytes().y())));
+                    const float data = *(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+                                                                         (_src->info()->strides_in_bytes().y())));
                     res              = std::max(res, data);
                 }
             }
@@ -1676,22 +1663,22 @@
         }
 
         // Store result
-        *(reinterpret_cast<float *>(output.ptr())) = res;
+        *(reinterpret_cast<float *>(dst.ptr())) = res;
     },
-    input, output);
+    src, dst);
 }
 
-void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
-                                             bool exclude_padding)
+void CpuPoolingKernel::pooling2_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type,
+                                         bool exclude_padding)
 {
     if(pooling_type == PoolingType::MAX && _indices)
     {
-        pooling2_nchw_maxpool_indices<float>(window_input, window);
+        pooling2_nchw_maxpool_indices<float>(window_src, window);
     }
     else
     {
-        Iterator      input(_input, window_input);
-        Iterator      output(_output, window);
+        Iterator      src(_src, window_src);
+        Iterator      dst(_dst, window);
         constexpr int pool_size       = 2;
         const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
         const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
@@ -1700,16 +1687,16 @@
         int           pool_stride_x   = 0;
         int           pool_stride_y   = 0;
         std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-        const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-        const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+        const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+        const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-        const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-        const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+        const uint8_t *const src_top_ptr    = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+        const uint8_t *const src_bottom_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
 
         execute_window_loop(window, [&](const Coordinates & id)
         {
-            const auto  in_top_ptr    = reinterpret_cast<const float *>(input_top_ptr + input.offset());
-            const auto  in_bottom_ptr = reinterpret_cast<const float *>(input_bottom_ptr + input.offset());
+            const auto  in_top_ptr    = reinterpret_cast<const float *>(src_top_ptr + src.offset());
+            const auto  in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + src.offset());
             float32x2_t top_data      = vld1_f32(in_top_ptr);
             float32x2_t bottom_data   = vld1_f32(in_bottom_ptr);
             float32x2_t res           = {};
@@ -1745,16 +1732,16 @@
             }
 
             // Store result
-            *(reinterpret_cast<float *>(output.ptr())) = final_res;
+            *(reinterpret_cast<float *>(dst.ptr())) = final_res;
         },
-        input, output);
+        src, dst);
     }
 }
 
-void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling3_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window);
 
     constexpr const int pool_size       = 3;
     const int           pool_pad_right  = _pool_info.pad_stride_info.pad_right();
@@ -1764,18 +1751,18 @@
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+    const uint8_t *const src_top_ptr    = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const src_middle_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const uint8_t *const src_bottom_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        float32x4_t top_data    = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
-        float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
-        float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+        float32x4_t top_data    = vld1q_f32(reinterpret_cast<const float *>(src_top_ptr + src.offset()));
+        float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(src_middle_ptr + src.offset()));
+        float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(src_bottom_ptr + src.offset()));
         float32x2_t res         = {};
         float       final_res   = 0;
 
@@ -1813,15 +1800,15 @@
         }
 
         // Store result
-        *(reinterpret_cast<float *>(output.ptr())) = final_res;
+        *(reinterpret_cast<float *>(dst.ptr())) = final_res;
     },
-    input, output);
+    src, dst);
 }
 
-void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling7_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window);
 
     constexpr const int pool_size       = 7;
     const int           pool_pad_right  = _pool_info.pad_stride_info.pad_right();
@@ -1831,13 +1818,13 @@
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    std::array<const uint8_t *, pool_size> input_ptrs{ {} };
+    std::array<const uint8_t *, pool_size> src_ptrs{ {} };
     for(int i = 0; i < pool_size; ++i)
     {
-        input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
+        src_ptrs[i] = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
     }
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -1851,7 +1838,7 @@
             const float32x2_t scale_v = vdup_n_f32(scale);
 
             // Perform pooling
-            float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+            float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + src.offset()));
             // Get power of 2 in case of l2 pooling
             if(pooling_type == PoolingType::L2)
             {
@@ -1861,7 +1848,7 @@
             float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
             for(int i = 1; i < pool_size; ++i)
             {
-                data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+                data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + src.offset()));
                 // Get power of 2 in case of l2 pooling
                 if(pooling_type == PoolingType::L2)
                 {
@@ -1876,10 +1863,10 @@
         }
         else
         {
-            float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+            float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + src.offset()));
             for(int i = 1; i < pool_size; ++i)
             {
-                const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+                const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + src.offset()));
                 max_data                 = vmax2q_f32(max_data, data);
             }
             res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
@@ -1895,16 +1882,16 @@
         }
 
         // Store result
-        *(reinterpret_cast<float *>(output.ptr())) = final_res;
+        *(reinterpret_cast<float *>(dst.ptr())) = final_res;
     },
-    input, output);
+    src, dst);
 }
 
-void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_f32_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
     {
-        pooling2_f32_nhwc_maxpool_indices(window_input, window);
+        pooling2_f32_nhwc_maxpool_indices(window_src, window);
     }
     else
     {
@@ -1915,11 +1902,11 @@
         Window window_out = window;
         window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-        Iterator input(_input, window_input);
-        Iterator output(_output, window_out);
+        Iterator src(_src, window_src);
+        Iterator dst(_dst, window_out);
 
-        const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
-        const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
+        const int pool_size_x     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.width;
+        const int pool_size_y     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().z() : _pool_info.pool_size.height;
         const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
         const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
         const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
@@ -1927,8 +1914,8 @@
         int       pool_stride_x   = 0;
         int       pool_stride_y   = 0;
         std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-        const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
-        const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+        const int upper_bound_w = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+        const int upper_bound_h = _src->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
 
         float32x4_t vres;
 
@@ -1939,10 +1926,10 @@
             const int pool_limit_y = pool_pad_top - idx_height;
             const int pool_limit_x = pool_pad_left - idx_width;
 
-            const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
-            const int pool_end_y   = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
-            const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
-            const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+            const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+            const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+            const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+            const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
 
             int x_off = window_start_x;
             for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
@@ -1961,8 +1948,8 @@
                     {
                         for(int x = pool_start_x; x < pool_end_x; ++x)
                         {
-                            const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                               (_input->info()->strides_in_bytes().z())) + x_off);
+                            const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                                               (_src->info()->strides_in_bytes().z())) + x_off);
 
                             // Get power of 2 in case of l2 pooling and accumulate
                             if(pooling_type == PoolingType::L2)
@@ -1985,8 +1972,8 @@
                     {
                         for(int x = pool_start_x; x < pool_end_x; ++x)
                         {
-                            const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                               (_input->info()->strides_in_bytes().z())) + x_off);
+                            const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                                               (_src->info()->strides_in_bytes().z())) + x_off);
                             vres                   = vmaxq_f32(vres, data);
                         }
                     }
@@ -2004,7 +1991,7 @@
                 }
 
                 // Store result
-                vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
+                vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + x_off, vres);
             }
 
             // Left-overs loop
@@ -2022,8 +2009,8 @@
                     {
                         for(int x = pool_start_x; x < pool_end_x; ++x)
                         {
-                            const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                 (_input->info()->strides_in_bytes().z())) + x_off);
+                            const float data = *(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                                 (_src->info()->strides_in_bytes().z())) + x_off);
 
                             // Get power of 2 in case of l2 pooling and accumulate
                             if(pooling_type == PoolingType::L2)
@@ -2047,8 +2034,8 @@
                     {
                         for(int x = pool_start_x; x < pool_end_x; ++x)
                         {
-                            const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                 (_input->info()->strides_in_bytes().z())) + x_off);
+                            const float data = *(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                                 (_src->info()->strides_in_bytes().z())) + x_off);
                             res              = std::max(res, data);
                         }
                     }
@@ -2061,14 +2048,14 @@
                 }
 
                 // Store result
-                *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
+                *(reinterpret_cast<float *>(dst.ptr()) + x_off) = res;
             }
         },
-        input, output);
+        src, dst);
     }
 }
 
-void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window)
+void CpuPoolingKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_src, const Window &window)
 {
     const int window_start_x = window.x().start();
     const int window_end_x   = window.x().end();
@@ -2077,8 +2064,8 @@
     Window window_out = window;
     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator input(_input, window_input);
-    Iterator output(_output, window_out);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window_out);
     Iterator indices(_indices, window_out);
 
     const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
@@ -2091,9 +2078,9 @@
     float32x4_t vres;
     float       res;
 
-    const int pad_right   = _input->info()->padding().right;
-    const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
-    const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
+    const int pad_right   = _src->info()->padding().right;
+    const int in_stride_y = static_cast<int>(_src->info()->strides_in_bytes().y());
+    const int in_stride_z = static_cast<int>(_src->info()->strides_in_bytes().z());
 
     execute_window_loop(window_out, [&](const Coordinates & id)
     {
@@ -2102,37 +2089,37 @@
         const int pool_limit_y = pool_pad_top - idx_height;
         const int pool_limit_x = pool_pad_left - idx_width;
 
-        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
-        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
+        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
 
-        const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
-                                 (_input->info()->strides_in_bytes().z());
-        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
-                                 (_input->info()->strides_in_bytes().z());
-        const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
-                                 (_input->info()->strides_in_bytes().z());
-        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
-                                 (_input->info()->strides_in_bytes().z());
+        const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
+                                 (_src->info()->strides_in_bytes().z());
+        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
+                                 (_src->info()->strides_in_bytes().z());
+        const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
+                                 (_src->info()->strides_in_bytes().z());
+        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
+                                 (_src->info()->strides_in_bytes().z());
 
         int x_off = window_start_x;
         for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
         {
-            const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset);
-            const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset);
-            const auto in_x2_ptr = reinterpret_cast<const float *>(input.ptr() + in_x2_offset);
-            const auto in_x3_ptr = reinterpret_cast<const float *>(input.ptr() + in_x3_offset);
+            const auto in_x0_ptr = reinterpret_cast<const float *>(src.ptr() + in_x0_offset);
+            const auto in_x1_ptr = reinterpret_cast<const float *>(src.ptr() + in_x1_offset);
+            const auto in_x2_ptr = reinterpret_cast<const float *>(src.ptr() + in_x2_offset);
+            const auto in_x3_ptr = reinterpret_cast<const float *>(src.ptr() + in_x3_offset);
             const auto v_x0      = vld1q_f32(in_x0_ptr + x_off);
             const auto v_x1      = vld1q_f32(in_x1_ptr + x_off);
             const auto v_x2      = vld1q_f32(in_x2_ptr + x_off);
             const auto v_x3      = vld1q_f32(in_x3_ptr + x_off);
             vres                 = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
             // Store result
-            vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
+            vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + x_off, vres);
 
-            const uint32_t   offset_base  = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+            const uint32_t   offset_base  = offset_no_padding<float>(src.offset(), id, *_src->info(), pool_stride_x, pool_stride_y);
             const uint32_t   offset_x0    = (uint32_t)offset_base / sizeof(float) + x_off;
             const uint32_t   offset_x1    = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
-            const uint32_t   offset_x2    = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
+            const uint32_t   offset_x2    = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _src->info()->tensor_shape()[1];
             const uint32_t   offset_x3    = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
             const uint32x4_t voffset_x0   = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
             const uint32x4_t voffset_x1   = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
@@ -2149,19 +2136,19 @@
         // Left-overs loop
         for(; x_off < window_end_x; ++x_off)
         {
-            const auto x0 = *(reinterpret_cast<const float *>(input.ptr() + in_x0_offset) + x_off);
-            const auto x1 = *(reinterpret_cast<const float *>(input.ptr() + in_x1_offset) + x_off);
-            const auto x2 = *(reinterpret_cast<const float *>(input.ptr() + in_x2_offset) + x_off);
-            const auto x3 = *(reinterpret_cast<const float *>(input.ptr() + in_x3_offset) + x_off);
+            const auto x0 = *(reinterpret_cast<const float *>(src.ptr() + in_x0_offset) + x_off);
+            const auto x1 = *(reinterpret_cast<const float *>(src.ptr() + in_x1_offset) + x_off);
+            const auto x2 = *(reinterpret_cast<const float *>(src.ptr() + in_x2_offset) + x_off);
+            const auto x3 = *(reinterpret_cast<const float *>(src.ptr() + in_x3_offset) + x_off);
             res           = std::max(std::max(x2, x3), std::max(x0, x1));
 
             // Store result
-            *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
+            *(reinterpret_cast<float *>(dst.ptr()) + x_off) = res;
 
-            const uint32_t offset_base = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+            const uint32_t offset_base = offset_no_padding<float>(src.offset(), id, *_src->info(), pool_stride_x, pool_stride_y);
             const uint32_t offset_x0   = (uint32_t)offset_base / sizeof(float) + x_off;
             const uint32_t offset_x1   = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
-            const uint32_t offset_x2   = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
+            const uint32_t offset_x2   = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _src->info()->tensor_shape()[1];
             const uint32_t offset_x3   = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
             const uint32_t tmp_idx0    = (x0 >= x1) ? offset_x0 : offset_x1;
             const uint32_t tmp_idx1    = (x2 >= x3) ? offset_x2 : offset_x3;
@@ -2171,14 +2158,14 @@
             *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
         }
     },
-    input, output, indices);
+    src, dst, indices);
 }
 
 template <typename T>
-void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window);
 
     /** NEON vector types */
     using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -2187,8 +2174,8 @@
     using q32_t   = typename wrapper::traits::promote_t<q16_t>;
     using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
 
-    const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
-    const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
+    const int pool_size_x     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().x() : _pool_info.pool_size.width;
+    const int pool_size_y     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.height;
     const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
     const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
     const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
@@ -2196,11 +2183,11 @@
     int       pool_stride_x   = 0;
     int       pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const UniformQuantizationInfo &input_qinfo  = _input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
+    const UniformQuantizationInfo &src_qinfo = _src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo &dst_qinfo = _dst->info()->quantization_info().uniform();
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -2220,8 +2207,8 @@
                 int x = 0;
                 for(; x <= (pool_size_x - 8); x += 8)
                 {
-                    const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
-                                                                                   (_input->info()->strides_in_bytes().y())));
+                    const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+                                                                                   (_src->info()->strides_in_bytes().y())));
 
                     const q16x8_t data_q16 = wrapper::vmovl(data);
                     vres                   = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
@@ -2230,8 +2217,8 @@
                 // Leftover for loop
                 for(; x < pool_size_x; ++x)
                 {
-                    T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
-                                                           (_input->info()->strides_in_bytes().y())));
+                    T data = *(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+                                                           (_src->info()->strides_in_bytes().y())));
                     sres += data;
                 }
             }
@@ -2252,15 +2239,15 @@
                 int x = 0;
                 for(; x <= (pool_size_x - 8); x += 8)
                 {
-                    const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
-                                                                                   (_input->info()->strides_in_bytes().y())));
+                    const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+                                                                                   (_src->info()->strides_in_bytes().y())));
                     vres              = wrapper::vmax(vres, data);
                 }
                 // Leftover for loop
                 for(; x < pool_size_x; ++x)
                 {
-                    const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
-                                                                 (_input->info()->strides_in_bytes().y())));
+                    const T data = *(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+                                                                 (_src->info()->strides_in_bytes().y())));
                     res          = std::max(res, data);
                 }
             }
@@ -2274,14 +2261,14 @@
             res = std::max(res, wrapper::vgetlane(vres, 0));
         }
         // Store result
-        res                                    = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
-        *(reinterpret_cast<T *>(output.ptr())) = res;
+        res                                 = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res;
+        *(reinterpret_cast<T *>(dst.ptr())) = res;
     },
-    input, output);
+    src, dst);
 }
 
 template <typename T>
-void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_q8_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     const int window_start_x     = window.x().start();
     const int window_end_x       = window.x().end();
@@ -2291,8 +2278,8 @@
     Window window_out = window;
     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator input(_input, window_input);
-    Iterator output(_output, window_out);
+    Iterator src(_src, window_src);
+    Iterator dst(_dst, window_out);
 
     using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
     using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
@@ -2301,8 +2288,8 @@
     using q32_t   = typename wrapper::traits::promote_t<q16_t>;
     using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
 
-    const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
-    const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
+    const int pool_size_x     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.width;
+    const int pool_size_y     = _pool_info.is_global_pooling ? _src->info()->tensor_shape().z() : _pool_info.pool_size.height;
     const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
     const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
     const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
@@ -2311,20 +2298,20 @@
     int pool_stride_x = 0;
     int pool_stride_y = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_w = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _src->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
 
     const float32x4_t             half_scale_v = vdupq_n_f32(0.5f);
-    const UniformQuantizationInfo input_qinfo  = _input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
+    const UniformQuantizationInfo src_qinfo    = _src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo dst_qinfo    = _dst->info()->quantization_info().uniform();
 
-    const float quant_rescale = output_qinfo.scale / input_qinfo.scale;
+    const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
     // "new_offset" doesn't have to consider the "half_scale_v" in its computation
     // With a requantization performed in a single step there won't be uncertainties introduced
-    const int32_t new_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / quant_rescale);
+    const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
 
-    const float                   requant_scale  = output_qinfo.scale / input_qinfo.scale;
-    const int32_t                 requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
+    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
+    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
     const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
 
     execute_window_loop(window_out, [&](const Coordinates & id)
@@ -2334,10 +2321,10 @@
         const int pool_limit_y = pool_pad_top - idx_height;
         const int pool_limit_x = pool_pad_left - idx_width;
 
-        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
-        const int pool_end_y   = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
-        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
-        const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+        const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+        const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
 
         int x_off = window_start_x;
         for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
@@ -2358,8 +2345,8 @@
                 {
                     for(int x = pool_start_x; x < pool_end_x; ++x)
                     {
-                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                         (_input->info()->strides_in_bytes().z())) + x_off);
+                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                                         (_src->info()->strides_in_bytes().z())) + x_off);
 
                         const q16x8_t data_q16  = wrapper::vmovl(wrapper::vgetlow(data));
                         const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
@@ -2370,7 +2357,7 @@
                     }
                 }
 
-                if(input_qinfo != output_qinfo)
+                if(src_qinfo != dst_qinfo)
                 {
                     const float32x4x4_t vres =
                     {
@@ -2381,10 +2368,10 @@
                             vcvtq_f32_q32(vres4),
                         }
                     };
-                    const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+                    const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
                     // Store result
-                    wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, wrapper::vgetlow(requantized_output));
-                    wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, wrapper::vgethigh(requantized_output));
+                    wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+                    wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
                 }
                 else
                 {
@@ -2398,8 +2385,8 @@
                     const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
                     const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
                     // Store result
-                    wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, res1);
-                    wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, res2);
+                    wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off, res1);
+                    wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off + 8, res2);
                 }
             }
             else
@@ -2410,14 +2397,14 @@
                 {
                     for(int x = pool_start_x; x < pool_end_x; ++x)
                     {
-                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                         (_input->info()->strides_in_bytes().z())) + x_off);
+                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                                         (_src->info()->strides_in_bytes().z())) + x_off);
                         vres               = wrapper::vmax(vres, data);
                     }
                 }
 
                 // Store result
-                wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
+                wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
                                 requant_qinfo) :
                                 vres);
             }
@@ -2432,15 +2419,15 @@
                 {
                     for(int x = pool_start_x; x < pool_end_x; ++x)
                     {
-                        const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                       (_input->info()->strides_in_bytes().z())) + x_off);
+                        const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                                       (_src->info()->strides_in_bytes().z())) + x_off);
                         vres              = wrapper::vmax(vres, data);
                     }
                 }
 
                 // Store result
-                wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off,
-                                (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+                wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off,
+                                (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
             }
         }
 
@@ -2460,20 +2447,20 @@
                 {
                     for(int x = pool_start_x; x < pool_end_x; ++x)
                     {
-                        const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                     (_input->info()->strides_in_bytes().z())) + x_off);
+                        const T data = *(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                     (_src->info()->strides_in_bytes().z())) + x_off);
                         res += data;
                     }
                 }
 
-                if(input_qinfo != output_qinfo)
+                if(src_qinfo != dst_qinfo)
                 {
-                    const float res_f              = static_cast<float>(res);
-                    const float new_scale          = quant_rescale / scale;
-                    const auto  requantized_output = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+                    const float res_f           = static_cast<float>(res);
+                    const float new_scale       = quant_rescale / scale;
+                    const auto  requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
 
                     // Store result
-                    *(reinterpret_cast<T *>(output.ptr()) + x_off) = requantized_output;
+                    *(reinterpret_cast<T *>(dst.ptr()) + x_off) = requantized_dst;
                 }
                 else
                 {
@@ -2481,7 +2468,7 @@
                     res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
 
                     // Store result
-                    *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
+                    *(reinterpret_cast<T *>(dst.ptr()) + x_off) = res;
                 }
             }
             else
@@ -2492,32 +2479,32 @@
                 {
                     for(int x = pool_start_x; x < pool_end_x; ++x)
                     {
-                        const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                     (_input->info()->strides_in_bytes().z())) + x_off);
+                        const T data = *(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+                                                                     (_src->info()->strides_in_bytes().z())) + x_off);
                         res          = std::max(res, data);
                     }
                 }
 
                 // Store result
-                if(input_qinfo != output_qinfo)
+                if(src_qinfo != dst_qinfo)
                 {
-                    const float res_f                              = static_cast<float>(res);
-                    *(reinterpret_cast<T *>(output.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+                    const float res_f                           = static_cast<float>(res);
+                    *(reinterpret_cast<T *>(dst.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
                 }
                 else
                 {
-                    *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
+                    *(reinterpret_cast<T *>(dst.ptr()) + x_off) = res;
                 }
             }
         }
 
     },
-    input, output);
+    src, dst);
 }
 
-Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
 
     unsigned int pooled_w                          = 0;
     unsigned int pooled_h                          = 0;
@@ -2529,25 +2516,25 @@
     unsigned int pool_size_y       = 0;
 
     // Get data layout
-    const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
+    const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
     const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
-    pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
-    pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
+    pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
 
     // Validate pool info before calling scaled_dimensions
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
 
-    // Check output dimensions
-    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
-                                                     input->dimension(idx_height),
+    // Check dst dimensions
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(src->dimension(idx_width),
+                                                     src->dimension(idx_height),
                                                      pool_size_x,
                                                      pool_size_y,
                                                      pool_info.pad_stride_info);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(),
                                                               (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
                                                               pool_size_x, pool_size_y)
                                 .first);
@@ -2555,24 +2542,28 @@
     return Status{};
 }
 
-void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
+void CpuPoolingKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
+    _src     = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    _dst     = tensors.get_tensor(TensorType::ACL_DST_0);
+    _indices = tensors.get_tensor(TensorType::ACL_DST_1);
+
     const unsigned int pool_stride_x   = _pool_info.pad_stride_info.stride().first;
     const unsigned int pool_stride_y   = _pool_info.pad_stride_info.stride().second;
     const unsigned int pool_size       = _pool_info.pool_size.width;
     const bool         exclude_padding = _pool_info.exclude_padding;
 
-    Window window_input(window);
+    Window window_src(window);
     if(_data_layout == DataLayout::NCHW)
     {
-        // Set step for input in x and y direction for the input
+        // Set step for src in x and y direction for the src
         unsigned int window_x_inc = 0;
-        switch(_input->info()->data_type())
+        switch(_src->info()->data_type())
         {
             case DataType::QASYMM8:
             case DataType::QASYMM8_SIGNED:
@@ -2596,17 +2587,19 @@
                 ARM_COMPUTE_ERROR("Not supported");
             }
         }
-        window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
-        window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
+        window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
+        window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
     }
     else
     {
-        window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
-        window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
-        window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
+        window_src.set(Window::DimX, Window::Dimension(0, 1, 1));
+        window_src.set(Window::DimY, Window::Dimension(0, _src->info()->dimension(1), pool_stride_x));
+        window_src.set(Window::DimZ, Window::Dimension(0, _src->info()->dimension(2), pool_stride_y));
     }
 
     // Run function
-    (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
+    (this->*_func)(window_src, window, _pool_info.pool_type, exclude_padding);
 }
+} // namespace kernels
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuPoolingKernel.h b/src/core/cpu/kernels/CpuPoolingKernel.h
new file mode 100644
index 0000000..036e436
--- /dev/null
+++ b/src/core/cpu/kernels/CpuPoolingKernel.h
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOLING_KERNEL_H
+#define ARM_COMPUTE_CPU_POOLING_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the pooling layer kernel */
+class CpuPoolingKernel : public ICpuKernel
+{
+public:
+    const char *name() const override
+    {
+        return "CpuPoolingKernel";
+    }
+    /** Default constructor */
+    CpuPoolingKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPoolingKernel);
+    /** Configure kernel for a given list of arguments
+     *
+     * @note F16 is supported for pool sizes 2 and 3 only
+     *
+     * @param[in]  src       Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst       Destination tensor info. Data types supported: Same as @p src.
+     * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out] indices   (optional) The indices of the maximal values. Data type supported: U32.
+     */
+    void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuPoolingKernel
+     *
+     * @note F16 is supported for pool sizes 2 and 3 only
+     *
+     * @param[in] src       Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] dst       Destination tensor info. Data types supported: Same as @p src.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Function to perform 2x2 pooling.
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void pooling2_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
+     *
+     * @param[in] window_src src region on which to execute the kernel.
+     * @param[in] window     dst region on which to execute the kernel.
+     */
+    void pooling2_f32_nhwc_maxpool_indices(const Window &window_src, const Window &window);
+    /** Function to perform MxN pooling for 32-bit floating point values.
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void poolingMxN_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform MxN pooling for 32-bit floating point values (NHWC).
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void poolingMxN_f32_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 7x7 pooling.
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void pooling7_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 3x3 pooling.
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void pooling3_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 2x2 pooling for float16_t.
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void pooling2_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 2x2 pooling and compute the pooling indices for FP32/FP16. The indices can be used for max unpool.
+     *
+     * @param[in] window_src src region on which to execute the kernel.
+     * @param[in] window     dst region on which to execute the kernel.
+     */
+    template <typename T>
+    void pooling2_nchw_maxpool_indices(const Window &window_src, const Window &window);
+    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
+     *
+     * @param[in] window_src src region on which to execute the kernel.
+     * @param[in] window     dst region on which to execute the kernel.
+     */
+    void pooling2_f16_nhwc_maxpool_indices(const Window &window_src, const Window &window);
+    /** Function to perform 3x3 pooling.
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void pooling3_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform MxN pooling for 16-bit floating point values.
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void poolingMxN_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform MxN pooling for 16-bit floating point values. (NHWC)
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    void poolingMxN_f16_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Template function to perform 2x2 pooling for 8-bit quantized fixed point. (NCHW)
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    template <typename T>
+    void pooling2_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Template function to perform 3x3 pooling for 8-bit quantized fixed point. (NCHW)
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    template <typename T>
+    void pooling3_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Template function to perform MxN pooling for 8-bit quantized. (NCHW)
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    template <typename T>
+    void poolingMxN_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Template function to perform MxN pooling for 8-bit quantized. (NHWC)
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    template <typename T>
+    void poolingMxN_q8_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Common signature for all the specialised Pooling functions
+     *
+     * @param[in] window_src      src region on which to execute the kernel.
+     * @param[in] window          dst region on which to execute the kernel.
+     * @param[in] pooling_type    Pooling operation to be computed.
+     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
+     */
+    using PoolingFunction = void (CpuPoolingKernel::*)(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding);
+
+private:
+    PoolingFunction  _func{ nullptr };
+    const ITensor   *_src{ nullptr };
+    ITensor         *_dst{ nullptr };
+    ITensor         *_indices{ nullptr };
+    PoolingLayerInfo _pool_info{};
+    DataLayout       _data_layout{ DataLayout::UNKNOWN };
+    unsigned int     _num_elems_processed_per_iteration{ 0 };
+    BorderSize       _border_size{ 0 };
+    bool             _is_square{ false };
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_POOLING_KERNEL_H */
diff --git a/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.cpp
deleted file mode 100644
index 427cd2e..0000000
--- a/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/NEON/functions/NEPoolingAssemblyDispatch.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-NEPoolingAssemblyDispatch::~NEPoolingAssemblyDispatch() = default;
-
-void NEPoolingAssemblyDispatch::configure(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info)
-{
-    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
-    const unsigned int num_threads = NEScheduler::get().num_threads();
-
-    // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
-    if(!NEPoolingAssemblyDispatch::validate(input, output, info))
-    {
-        return;
-    }
-
-    auto pooling_wrapper = std::make_unique<NEPoolingAssemblyWrapperKernel>();
-    ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
-    pooling_wrapper->configure(input, output, info, ci);
-
-    // Check if we have Global Pooling Layer
-    _is_global_pooling_layer = (input->dimension(2) == info.pool_size.width) && (input->dimension(1) == info.pool_size.height);
-
-    // Set workspace requirements
-    const unsigned int alignment = 4096;
-    _workspace.push_back(MemoryInfo(TensorType::ACL_DST_1, pooling_wrapper->get_working_size(num_threads), alignment));
-
-    _kernel = std::move(pooling_wrapper);
-}
-
-Status NEPoolingAssemblyDispatch::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info)
-{
-    return NEPoolingAssemblyWrapperKernel::validate(input, output, info);
-}
-
-bool NEPoolingAssemblyDispatch::is_configured() const
-{
-    return _kernel != nullptr;
-}
-
-void NEPoolingAssemblyDispatch::run(ITensorPack &tensors)
-{
-    if(tensors.empty())
-    {
-        ARM_COMPUTE_ERROR("No inputs provided");
-    }
-
-    if(_is_global_pooling_layer)
-    {
-        NEScheduler::get().schedule_op(_kernel.get(), Window::DimX, _kernel->window(), tensors);
-    }
-    else
-    {
-        NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-    }
-}
-} // namespace experimental
-
-struct NEPoolingAssemblyDispatch::Impl
-{
-    const ITensor                                           *src{ nullptr };
-    ITensor                                                 *dst{ nullptr };
-    ITensor                                                 *workspace{ nullptr };
-    std::unique_ptr<experimental::NEPoolingAssemblyDispatch> op{ nullptr };
-};
-
-NEPoolingAssemblyDispatch::NEPoolingAssemblyDispatch(NEPoolingAssemblyDispatch &&) = default;
-
-NEPoolingAssemblyDispatch &NEPoolingAssemblyDispatch::operator=(NEPoolingAssemblyDispatch &&) = default;
-
-NEPoolingAssemblyDispatch::~NEPoolingAssemblyDispatch() = default;
-
-NEPoolingAssemblyDispatch::NEPoolingAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager)
-    : _impl(std::make_unique<Impl>()),
-      _memory_group(std::move(memory_manager)),
-      _workspace()
-{
-}
-
-void NEPoolingAssemblyDispatch::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    _impl->src       = input;
-    _impl->dst       = output;
-    _impl->workspace = &_workspace;
-
-    _impl->op = std::make_unique<experimental::NEPoolingAssemblyDispatch>();
-    _impl->op->configure(input->info(), output->info(), info);
-
-    const auto workspace = _impl->op->workspace().at(0);
-    if(workspace.size > 0)
-    {
-        // Allocate workspace
-        allocate_workspace(workspace.size, workspace.alignment);
-    }
-}
-
-Status NEPoolingAssemblyDispatch::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info)
-{
-    return experimental::NEPoolingAssemblyDispatch::validate(input, output, info);
-}
-
-bool NEPoolingAssemblyDispatch::is_configured() const
-{
-    return _impl->op->is_configured();
-}
-
-void NEPoolingAssemblyDispatch::run()
-{
-    ITensorPack pack;
-    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
-    pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
-    pack.add_tensor(TensorType::ACL_DST_1, _impl->workspace);
-    _impl->op->run(pack);
-}
-
-void NEPoolingAssemblyDispatch::allocate_workspace(size_t workspace_size, size_t alignment)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
-    _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
-    _memory_group.manage(&_workspace);
-    _workspace.allocator()->allocate();
-}
-} //namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.h b/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.h
deleted file mode 100644
index f6d232b..0000000
--- a/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPOOLINGASSEMBLYDISPATCH_H
-#define ARM_COMPUTE_NEPOOLINGASSEMBLYDISPATCH_H
-
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/INEOperator.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward Declarations
-class ITensor;
-struct PoolingLayerInfo;
-
-/** Assembly kernel glue */
-class NEPoolingAssemblyDispatch : public IFunction
-{
-public:
-    /** Constructor */
-    NEPoolingAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPoolingAssemblyDispatch(const NEPoolingAssemblyDispatch &) = delete;
-    /** Default move constructor */
-    NEPoolingAssemblyDispatch(NEPoolingAssemblyDispatch &&);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPoolingAssemblyDispatch &operator=(const NEPoolingAssemblyDispatch &) = delete;
-    /** Default move assignment operator */
-    NEPoolingAssemblyDispatch &operator=(NEPoolingAssemblyDispatch &&);
-    /** Destructor */
-    ~NEPoolingAssemblyDispatch();
-
-    /** If supported create an assembly routine, else fallback to Compute Library function.
-     *
-     * @param[in]  input  Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output Output tensor to store the result of pooling. Data types supported: same as @p input.
-     * @param[in]  info   Pooling meta-data
-     */
-    void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &info);
-
-    /** Indicates whether or not this function can be used to process the given parameters.
-     *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] output Output tensor to store the result of pooling. Data types supported: same as @p input.
-     * @param[in] info   Pooling meta-data
-     *
-     * @return a status.
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info);
-
-    /** Was the function successfully configured ?
-     *
-     * @return True if the function is configured and ready to run
-     */
-    bool is_configured() const;
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    /** Helper function to allocate memory for the workspace needed by the
-     * assembly kernels
-     *
-     * @param[in] workspace_size Total size of the workspace.
-     * @param[in] alignment      Alignment requirement in bytes.
-     */
-    void allocate_workspace(size_t workspace_size, size_t alignment);
-
-    struct Impl;
-    std::unique_ptr<Impl> _impl;
-
-    MemoryGroup _memory_group{};
-    Tensor      _workspace{};
-};
-
-namespace experimental
-{
-/** Basic function to run pooling assembly kernels */
-class NEPoolingAssemblyDispatch : public INEOperator
-{
-public:
-    /** Constructor */
-    NEPoolingAssemblyDispatch() = default;
-    /** Prevent instances of this class from being copied */
-    NEPoolingAssemblyDispatch(const NEPoolingAssemblyDispatch &) = delete;
-    /** Default move constructor */
-    NEPoolingAssemblyDispatch(NEPoolingAssemblyDispatch &&) = default;
-    /** Prevent instances of this class from being copied */
-    NEPoolingAssemblyDispatch &operator=(const NEPoolingAssemblyDispatch &) = delete;
-    /** Default move assignment operator */
-    NEPoolingAssemblyDispatch &operator=(NEPoolingAssemblyDispatch &&) = default;
-    /** Destructor */
-    ~NEPoolingAssemblyDispatch();
-
-    /** If supported create an assembly routine, else fallback to Compute Library function.
-     *
-     * @param[in]  input  Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output Output tensor to store the result of pooling. Data types supported: same as @p input.
-     * @param[in]  info   Pooling meta-data
-     */
-    void configure(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info);
-
-    /** Indicates whether or not this function can be used to process the given parameters.
-     *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] output Output tensor to store the result of pooling. Data types supported: same as @p input.
-     * @param[in] info   Pooling meta-data
-     *
-     * @return a status.
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info);
-    /** Was the function successfully configured ?
-     *
-     * @return True if the function is configured and ready to run
-     */
-    bool is_configured() const;
-    // Run method overriden
-    void run(ITensorPack &tensors) override;
-
-private:
-    bool _is_global_pooling_layer{ false };
-};
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEPOOLINGASSEMBLYDISPATCH_H */
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 0c857b5..dd7a3a3 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -23,103 +23,48 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
 
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
-#include "src/runtime/NEON/functions/NEPoolingAssemblyDispatch.h"
+#include "arm_compute/core/Validate.h"
+#include "src/runtime/cpu/operators/CpuPooling.h"
 
 namespace arm_compute
 {
+struct NEPoolingLayer::Impl
+{
+    ITensor                         *src{ nullptr };
+    ITensor                         *dst{ nullptr };
+    ITensor                         *indices{ nullptr };
+    std::shared_ptr<IMemoryManager>  memory_manager{ nullptr };
+    std::unique_ptr<cpu::CpuPooling> op{ nullptr };
+};
+
 NEPoolingLayer::~NEPoolingLayer() = default;
 
 NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_manager(std::move(memory_manager)), _pooling_layer_kernel(), _border_handler(), _asm_glue(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW)
+    : _impl(std::make_unique<Impl>())
 {
+    _impl->memory_manager = std::move(memory_manager);
 }
 
 void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
 {
-    // Check if we can run assembly kernels. Currently, indices are not supported by those kernels
-    const bool run_optimised = bool(NEPoolingAssemblyDispatch::validate(input->info(), output->info(), pool_info)) && (indices == nullptr);
-
-    if(run_optimised)
-    {
-        _asm_glue = std::make_unique<NEPoolingAssemblyDispatch>(_memory_manager);
-        _asm_glue->configure(input, output, pool_info);
-        ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
-    }
-    else
-    {
-        // Check if we have Global Pooling Layer
-        _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size.width) && (input->info()->dimension(1) == pool_info.pool_size.height);
-
-        // Get data layout
-        _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
-
-        // Configure pooling kernel
-        _pooling_layer_kernel = std::make_unique<NEPoolingLayerKernel>();
-        _pooling_layer_kernel->configure(input, output, pool_info, indices);
-
-        switch(_data_layout)
-        {
-            case DataLayout::NCHW:
-            {
-                // Configure border depending on operation required (quantize border in case of asymmetric data_type)
-                BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-                PixelValue zero_value((indices) ? std::numeric_limits<int>::min() : 0.f);
-                if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding)
-                {
-                    zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
-                }
-                _border_handler = std::make_unique<NEFillBorderKernel>();
-                _border_handler->configure(input, _pooling_layer_kernel->border_size(), border_mode, zero_value);
-                break;
-            }
-            case DataLayout::NHWC:
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Data layout not supported");
-        }
-    }
+    _impl->src     = input;
+    _impl->dst     = output;
+    _impl->indices = indices;
+    _impl->op      = std::make_unique<cpu::CpuPooling>(_impl->memory_manager);
+    _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
 }
 
 Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
-    const bool run_optimised = bool(NEPoolingAssemblyDispatch::validate(input, output, pool_info)) && (indices == nullptr);
-
-    if(run_optimised)
-    {
-        return Status{};
-    }
-
-    return NEPoolingLayerKernel::validate(input, output, pool_info, indices);
+    return cpu::CpuPooling::validate(input, output, pool_info, indices);
 }
 
 void NEPoolingLayer::run()
 {
-    if(_asm_glue && _asm_glue->is_configured())
-    {
-        _asm_glue->run();
-    }
-    else
-    {
-        switch(_data_layout)
-        {
-            case DataLayout::NCHW:
-                // Fill border
-                NEScheduler::get().schedule(_border_handler.get(), Window::DimY);
-
-                // Run pooling layer
-                NEScheduler::get().schedule(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY);
-                break;
-            case DataLayout::NHWC:
-                // Run pooling layer
-                NEScheduler::get().schedule(_pooling_layer_kernel.get(), Window::DimX);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Data layout not supported");
-        }
-    }
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
+    pack.add_tensor(TensorType::ACL_DST_1, _impl->indices);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuPooling.cpp b/src/runtime/cpu/operators/CpuPooling.cpp
new file mode 100644
index 0000000..0b9b38d
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuPooling.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuPooling.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/cpu/kernels/CpuPoolingKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+// The memory manager is stored so it can be forwarded to the assembly dispatch
+// created in configure(); all kernel pointers remain unset until configure() runs.
+CpuPooling::CpuPooling(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_manager(std::move(memory_manager)), _pooling_layer_kernel(), _border_handler(), _asm_glue(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW)
+{
+}
+
+// Defaulted destructor, defined out-of-line in this translation unit.
+CpuPooling::~CpuPooling() = default;
+
+// Configure either the assembly-optimised pooling path or the generic
+// CpuPoolingKernel (plus an NCHW border handler), depending on support.
+void CpuPooling::configure(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+{
+    // Check if we can run assembly kernels. Currently, indices are not supported by those kernels
+    const bool run_optimised = bool(CpuPoolingAssemblyDispatch::validate(input, output, pool_info)) && (indices == nullptr);
+
+    if(run_optimised)
+    {
+        _asm_glue = std::make_unique<CpuPoolingAssemblyDispatch>(_memory_manager);
+        _asm_glue->configure(input, output, pool_info);
+        // validate() already passed above, so configuration must have succeeded.
+        ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
+    }
+    else
+    {
+        // Check if we have Global Pooling Layer (pool size covers the full extent of dimensions 0 and 1)
+        _is_global_pooling_layer = (input->dimension(0) == pool_info.pool_size.width) && (input->dimension(1) == pool_info.pool_size.height);
+
+        // Get data layout (fall back to the input's layout when not explicitly specified)
+        _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
+
+        // Configure pooling kernel
+        auto k = std::make_unique<kernels::CpuPoolingKernel>();
+        k->configure(input, output, pool_info, indices);
+        _pooling_layer_kernel = std::move(k);
+
+        switch(_data_layout)
+        {
+            case DataLayout::NCHW:
+            {
+                // Configure border depending on operation required (quantize border in case of asymmetric data_type)
+                BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+                // NOTE(review): when indices are requested the constant border is the lowest int
+                // value, presumably so padded elements can never win the max — confirm against kernel.
+                PixelValue zero_value((indices) ? std::numeric_limits<int>::min() : 0.f);
+                if(is_data_type_quantized_asymmetric(input->data_type()) && !pool_info.exclude_padding)
+                {
+                    zero_value = PixelValue(0, input->data_type(), input->quantization_info());
+                }
+                auto b = std::make_unique<NEFillBorderKernel>();
+                b->configure(input, _pooling_layer_kernel->border_size(), border_mode, zero_value);
+                _border_handler = std::move(b);
+                break;
+            }
+            case DataLayout::NHWC:
+                // No border handler is configured for NHWC
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Data layout not supported");
+        }
+    }
+}
+
+// Static validation: accept immediately if the assembly path supports the
+// configuration (indices are not supported there); otherwise defer to the
+// generic pooling kernel's validation.
+Status CpuPooling::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+{
+    const bool run_optimised = bool(CpuPoolingAssemblyDispatch::validate(input, output, pool_info)) && (indices == nullptr);
+
+    if(run_optimised)
+    {
+        return Status{};
+    }
+
+    return kernels::CpuPoolingKernel::validate(input, output, pool_info, indices);
+}
+
+// Execute the configured path: assembly dispatch when available, otherwise the
+// generic kernel (preceded by border filling on the NCHW path).
+void CpuPooling::run(ITensorPack &tensors)
+{
+    if(_asm_glue && _asm_glue->is_configured())
+    {
+        _asm_glue->run(tensors);
+    }
+    else
+    {
+        switch(_data_layout)
+        {
+            case DataLayout::NCHW:
+                // Fill border before pooling reads the padded region
+                NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors);
+
+                // Run pooling layer; a global pooling layer is split across DimZ, otherwise DimY
+                NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors);
+                break;
+            case DataLayout::NHWC:
+                // Run pooling layer, split across DimX
+                NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Data layout not supported");
+        }
+    }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuPooling.h b/src/runtime/cpu/operators/CpuPooling.h
new file mode 100644
index 0000000..aa607b4
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuPooling.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOLING_H
+#define ARM_COMPUTE_CPU_POOLING_H
+
+#include "src/runtime/cpu/ICpuOperator.h"
+
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward Declarations
+struct PoolingLayerInfo;
+
+namespace cpu
+{
+// Forward Declarations
+class CpuPoolingAssemblyDispatch;
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if padding size is different from zero, NCHW path only)
+ * -# @ref kernels::CpuPoolingKernel
+ * -# @ref CpuPoolingAssemblyDispatch
+ */
+class CpuPooling : public ICpuOperator
+{
+public:
+    /** Constructor */
+    CpuPooling(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CpuPooling(const CpuPooling &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CpuPooling &operator=(const CpuPooling &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    CpuPooling(CpuPooling &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    CpuPooling &operator=(CpuPooling &&) = delete;
+    /** Default destructor */
+    ~CpuPooling();
+    /** Set the src and dst tensors.
+     *
+     * @note F16 is supported for pool sizes 2 and 3 only
+     *
+     * @param[in, out] src       Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out]     dst       Destination tensor info. Data types supported: same as @p src.
+     * @param[in]      pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out]     indices   (optional) The indices of the maximal values. Data type supported: U32.
+     */
+    void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuPooling
+     *
+     * @note F16 is supported for pool sizes 2 and 3 only
+     *
+     * @param[in] src       Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] dst       Destination tensor info. Data types supported: same as @p src.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[in] indices   (optional) Tensor info of the indices of the maximal values. Data type supported: U32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::shared_ptr<IMemoryManager> _memory_manager; // Forwarded to the assembly dispatch when the optimised path is chosen
+
+    std::unique_ptr<INEKernel>                  _pooling_layer_kernel; // Generic pooling kernel (fallback path)
+    std::unique_ptr<INEKernel>                  _border_handler;       // Fills borders before pooling (configured for NCHW only)
+    std::unique_ptr<CpuPoolingAssemblyDispatch> _asm_glue;             // Optimised assembly path; nullptr when not used
+
+    bool       _is_global_pooling_layer; // True when the pool size covers the full extent of dims 0 and 1
+    DataLayout _data_layout;             // Layout the generic kernel was configured for
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOLING_H */
diff --git a/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp
new file mode 100644
index 0000000..4a56233
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+// The memory group takes ownership of the (optional) memory manager; the
+// workspace tensor is registered with it during configure().
+CpuPoolingAssemblyDispatch::CpuPoolingAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)),
+      _workspace(),
+      _is_global_pooling_layer(false)
+{
+}
+
+// Defaulted destructor, defined out-of-line in this translation unit.
+CpuPoolingAssemblyDispatch::~CpuPoolingAssemblyDispatch() = default;
+
+// Configure the assembly wrapper kernel and allocate its scratch workspace.
+// Silently does nothing when the configuration is unsupported (see is_configured()).
+void CpuPoolingAssemblyDispatch::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info)
+{
+    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
+    const unsigned int num_threads = NEScheduler::get().num_threads();
+
+    // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
+    if(!CpuPoolingAssemblyDispatch::validate(src, dst, info))
+    {
+        return;
+    }
+
+    auto pooling_wrapper = std::make_unique<kernels::CpuPoolingAssemblyWrapperKernel>();
+    ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
+    pooling_wrapper->configure(src, dst, info, ci);
+
+    // Check if we have Global Pooling Layer
+    // NOTE(review): pool width is compared against dimension 2 and height against
+    // dimension 1, unlike CpuPooling::configure() which pairs width/height with
+    // dimensions 0/1 — confirm the intended dimension pairing for this path.
+    _is_global_pooling_layer = (src->dimension(2) == info.pool_size.width) && (src->dimension(1) == info.pool_size.height);
+
+    // Allocate workspace based on kernel's memory requirements
+    constexpr size_t alignment      = 4096;
+    const size_t     workspace_size = pooling_wrapper->get_working_size(num_threads);
+    _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
+    _memory_group.manage(&_workspace);
+    _workspace.allocator()->allocate();
+
+    // Storing the kernel marks this dispatch as configured (see is_configured()).
+    _kernel = std::move(pooling_wrapper);
+}
+
+// Thin static wrapper: support is decided entirely by the assembly wrapper kernel.
+Status CpuPoolingAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
+{
+    return kernels::CpuPoolingAssemblyWrapperKernel::validate(src, dst, info);
+}
+
+// The kernel pointer is only set at the end of a successful configure(), so a
+// null kernel means "not configured / unsupported configuration".
+bool CpuPoolingAssemblyDispatch::is_configured() const
+{
+    return _kernel != nullptr;
+}
+
+// Run the assembly kernel, appending the internal workspace to the tensor pack.
+void CpuPoolingAssemblyDispatch::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No srcs provided");
+
+    // The kernel receives its scratch workspace as an extra destination tensor.
+    tensors.add_tensor(TensorType::ACL_DST_1, &_workspace);
+
+    if(_is_global_pooling_layer)
+    {
+        // Global pooling: split the work across DimX
+        NEScheduler::get().schedule_op(_kernel.get(), Window::DimX, _kernel->window(), tensors);
+    }
+    else
+    {
+        NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+    }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.h b/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.h
new file mode 100644
index 0000000..353bbe1
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOLING_ASSEMBLY_DISPATCH_H
+#define ARM_COMPUTE_CPU_POOLING_ASSEMBLY_DISPATCH_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+class ITensor;
+
+/** Basic function to run pooling assembly kernels */
+class CpuPoolingAssemblyDispatch : public ICpuOperator
+{
+public:
+    /** Constructor */
+    CpuPoolingAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied */
+    CpuPoolingAssemblyDispatch(const CpuPoolingAssemblyDispatch &) = delete;
+    /** Default move constructor */
+    CpuPoolingAssemblyDispatch(CpuPoolingAssemblyDispatch &&) = default;
+    /** Prevent instances of this class from being copied */
+    CpuPoolingAssemblyDispatch &operator=(const CpuPoolingAssemblyDispatch &) = delete;
+    /** Default move assignment operator */
+    CpuPoolingAssemblyDispatch &operator=(CpuPoolingAssemblyDispatch &&) = default;
+    /** Destructor */
+    ~CpuPoolingAssemblyDispatch();
+
+    /** If supported create an assembly routine, else fallback to Compute Library function.
+     *
+     * @note If the configuration is not supported this method silently returns without
+     *       configuring anything; check is_configured() afterwards.
+     *
+     * @param[in]  src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst  Destination tensor info to store the result of pooling. Data types supported: same as @p src.
+     * @param[in]  info Pooling meta-data
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info);
+
+    /** Indicates whether or not this function can be used to process the given parameters.
+     *
+     * @param[in] src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] dst  Destination tensor to store the result of pooling. Data types supported: same as @p src.
+     * @param[in] info Pooling meta-data
+     *
+     * @return a status.
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info);
+    /** Was the function successfully configured ?
+     *
+     * @return True if the function is configured and ready to run
+     */
+    bool is_configured() const;
+    // Run method overridden
+    void run(ITensorPack &tensors) override;
+
+private:
+    arm_compute::MemoryGroup _memory_group; // Manages the lifetime of the workspace tensor
+
+    arm_compute::Tensor _workspace;               // Scratch memory required by the assembly kernel
+    bool                _is_global_pooling_layer; // True when the pool size covers the input's spatial extents (see configure())
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOLING_ASSEMBLY_DISPATCH_H */