Add CPU Pool3d FP16/32 implementation

- Add implementation for the CPU pooling 3d layer.
- NDHWC data layout support
- Support FP32/FP16.
- Add Pool3d to the operator list.
- Fix CL Pool3d kernel comments to generate the operator list.

Resolves: COMPMID-4671

Signed-off-by: Adnan AlSinan <adnan.alsinan@arm.com>
Change-Id: I92478a154beb12541525b648ed3dd5a58c8f27fa
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7311
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index c80bba0..cf279e0 100644
--- a/Android.bp
+++ b/Android.bp
@@ -420,6 +420,7 @@
         "src/cpu/kernels/CpuMulKernel.cpp",
         "src/cpu/kernels/CpuPermuteKernel.cpp",
         "src/cpu/kernels/CpuPool2dKernel.cpp",
+        "src/cpu/kernels/CpuPool3dKernel.cpp",
         "src/cpu/kernels/CpuQuantizeKernel.cpp",
         "src/cpu/kernels/CpuReshapeKernel.cpp",
         "src/cpu/kernels/CpuScaleKernel.cpp",
@@ -507,6 +508,9 @@
         "src/cpu/kernels/pool2d/neon/nchw/all.cpp",
         "src/cpu/kernels/pool2d/neon/qasymm8.cpp",
         "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp",
+        "src/cpu/kernels/pool3d/neon/fp16.cpp",
+        "src/cpu/kernels/pool3d/neon/fp32.cpp",
+        "src/cpu/kernels/pool3d/neon/impl.cpp",
         "src/cpu/kernels/range/generic/neon/fp16.cpp",
         "src/cpu/kernels/range/generic/neon/fp32.cpp",
         "src/cpu/kernels/range/generic/neon/impl.cpp",
@@ -559,6 +563,7 @@
         "src/cpu/operators/CpuMul.cpp",
         "src/cpu/operators/CpuPermute.cpp",
         "src/cpu/operators/CpuPool2d.cpp",
+        "src/cpu/operators/CpuPool3d.cpp",
         "src/cpu/operators/CpuQuantize.cpp",
         "src/cpu/operators/CpuReshape.cpp",
         "src/cpu/operators/CpuScale.cpp",
@@ -848,6 +853,7 @@
         "src/runtime/NEON/functions/NEPadLayer.cpp",
         "src/runtime/NEON/functions/NEPermute.cpp",
         "src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp",
+        "src/runtime/NEON/functions/NEPooling3dLayer.cpp",
         "src/runtime/NEON/functions/NEPoolingLayer.cpp",
         "src/runtime/NEON/functions/NEPriorBoxLayer.cpp",
         "src/runtime/NEON/functions/NEQLSTMLayer.cpp",
diff --git a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
index 2e48237..8bad449 100644
--- a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
@@ -54,7 +54,7 @@
     CLPooling3dLayer &operator=(CLPooling3dLayer &&) = default;
     /** Set the input and output tensors.
      *
-     * Valid data layout:
+     * Valid data layouts:
      * - NDHWC
      *
      * Valid data type configurations:
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index 562ade1..a679e8c 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -80,6 +80,7 @@
 #include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h"
diff --git a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
new file mode 100644
index 0000000..7b31f91
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEPOOLING3DLAYER_H
+#define ARM_COMPUTE_NEPOOLING3DLAYER_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+class ITensorInfo;
+class IMemoryManager;
+/** Basic function to simulate a pooling 3d layer with the specified pooling operation. This function calls the following kernels:
+ *
+ * -# @ref cpu::CpuPool3d
+ */
+class NEPooling3dLayer : public IFunction
+{
+public:
+    /** Constructor */
+    NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPooling3dLayer(const NEPooling3dLayer &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPooling3dLayer &operator=(const NEPooling3dLayer &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEPooling3dLayer(NEPooling3dLayer &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEPooling3dLayer &operator=(NEPooling3dLayer &&) = delete;
+    /** Default destructor */
+    ~NEPooling3dLayer();
+    /** Set the input and output tensors.
+     *
+     * Valid data layouts:
+     * - NDHWC
+     *
+     * Valid data type configurations:
+     * |src            |dst            |
+     * |:--------------|:--------------|
+     * |F16            |F16            |
+     * |F32            |F32            |
+     *
+     * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+     *
+     * @param[in]  input     Source tensor. Data types supported: F16/F32.
+     * @param[out] output    Destination tensor.
+     * @param[in]  pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+     */
+    void configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEPooling3dLayer
+     *
+     *
+     * @param[in] input     Source tensor info. Data types supported: F16/F32.
+     * @param[in] output    Destination tensor info.
+     * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+}
+#endif /* ARM_COMPUTE_NEPOOLING3DLAYER_H */
diff --git a/arm_compute/runtime/OperatorList.h b/arm_compute/runtime/OperatorList.h
index 4646974..92b5079 100644
--- a/arm_compute/runtime/OperatorList.h
+++ b/arm_compute/runtime/OperatorList.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -720,6 +720,16 @@
  *
  */
 
+/** Pooling3dLayer
+ *
+ * Description:
+ * Function to perform pooling 3D with the specified pooling operation.
+ *
+ * Equivalent Android NNAPI Op:
+ * N/A
+ *
+ */
+
 /** PReluLayer
  *
  * Description:
diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox
index 1dfbdf6..ee337d4 100644
--- a/docs/user_guide/operator_list.dox
+++ b/docs/user_guide/operator_list.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2021 Arm Limited.
+/// Copyright (c) 2021-2022 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -2300,6 +2300,36 @@
     <tr><td>F32<td>F32
     </table>
 <tr>
+  <td rowspan="2">Pooling3dLayer
+  <td rowspan="2" style="width:200px;"> Function to perform pooling 3D with the specified pooling operation.
+  <td rowspan="2">
+      <ul>
+       <li>N/A
+      </ul>
+  <td>NEPooling3dLayer
+  <td>
+      <ul>
+       <li>NDHWC
+      </ul>
+  <td>
+    <table>
+    <tr><th>src<th>dst
+    <tr><td>F16<td>F16
+    <tr><td>F32<td>F32
+    </table>
+<tr>
+  <td>CLPooling3dLayer
+  <td>
+      <ul>
+       <li>NDHWC
+      </ul>
+  <td>
+    <table>
+    <tr><th>src<th>dst
+    <tr><td>F16<td>F16
+    <tr><td>F32<td>F32
+    </table>
+<tr>
   <td rowspan="2">PReluLayer
   <td rowspan="2" style="width:200px;"> Function to compute the activation layer with the PRELU activation function.
   <td rowspan="2">
diff --git a/filelist.json b/filelist.json
index fa43d86..1af856d 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1795,6 +1795,20 @@
           }
         }
       },
+      "Pool3d": {
+        "files": {
+          "common": [
+            "src/cpu/operators/CpuPool3d.cpp",
+            "src/cpu/kernels/CpuPool3dKernel.cpp",
+            "src/runtime/NEON/functions/NEPooling3dLayer.cpp"
+          ],
+          "neon": {
+            "common":[ "src/cpu/kernels/pool3d/neon/impl.cpp" ],
+            "fp16":  [ "src/cpu/kernels/pool3d/neon/fp16.cpp" ],
+            "fp32":  [ "src/cpu/kernels/pool3d/neon/fp32.cpp" ]
+          }
+        }
+      },
       "PRelu": {
         "deps": [ "ElementwiseBinary" ],
         "files": {
diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp
new file mode 100644
index 0000000..3321967
--- /dev/null
+++ b/src/cpu/kernels/CpuPool3dKernel.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuPool3dKernel.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/pool3d/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+using namespace misc::shape_calculator;
+
+static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels =
+{
+    {
+        "neon_fp16_ndhwc_poolMxNxD",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16 && data.isa.fp16); },
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)
+    },
+
+    {
+        "neon_fp32_ndhwc_poolMxNxD",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)
+    }
+};
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+    const auto data_layout = src->data_layout();
+    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int  idx_depth   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+
+    const bool         is_global_pooling = pool_info.is_global_pooling;
+    const unsigned int pool_size_x       = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    const unsigned int pool_size_y       = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    const unsigned int pool_size_z       = is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
+
+    const unsigned int stride_x = pool_info.stride.x();
+    const unsigned int stride_y = pool_info.stride.y();
+    const unsigned int stride_z = pool_info.stride.z();
+
+    ARM_COMPUTE_RETURN_ERROR_ON((pool_size_x == 0) || (pool_size_y == 0) || (pool_size_z == 0));
+    ARM_COMPUTE_RETURN_ERROR_ON((stride_x == 0) || (stride_y == 0) || (stride_z == 0));
+
+    int output_width  = 0;
+    int output_height = 0;
+    int output_depth  = 0;
+
+    std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], src->tensor_shape()[idx_depth],
+                                                                                      pool_size_x, pool_size_y, pool_size_z, pool_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid");
+
+    if(dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+        TensorInfo out_info(TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
+    }
+
+    const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+    return Status{};
+}
+} //namespace
+
+void CpuPool3dKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info));
+
+    // dst auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool3d_shape(src->tensor_shape(), pool_info)));
+
+    // Get data layout
+    const auto data_layout = src->data_layout();
+    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int  idx_depth   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+
+    // Update pool size in case of global pooling
+    const bool   is_global_pooling = pool_info.is_global_pooling;
+    const Size3D pool_size(
+        is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+        is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height,
+        is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth);
+
+    const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    ARM_COMPUTE_ERROR_ON(uk == nullptr);
+
+    // Set instance variables
+    _pool_info  = pool_info;
+    _run_method = uk->ukernel;
+    _name       = std::string("CpuPool3dKernel").append("/").append(uk->name);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps());
+    ICpuKernel::configure(win);
+}
+
+Status CpuPool3dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info));
+
+    return Status{};
+}
+
+void CpuPool3dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+    const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    ITensor       *dst = tensors.get_tensor(TensorType::ACL_DST_0);
+
+    _run_method(src, dst, _pool_info, window);
+}
+
+const char *CpuPool3dKernel::name() const
+{
+    return _name.c_str();
+}
+
+const std::vector<CpuPool3dKernel::Pooling3dKernel> &CpuPool3dKernel::get_available_kernels()
+{
+    return available_kernels;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h
new file mode 100644
index 0000000..f762cfc
--- /dev/null
+++ b/src/cpu/kernels/CpuPool3dKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOL3D_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL3D_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to perform Pooling 3D. */
+class CpuPool3dKernel : public ICpuKernel<CpuPool3dKernel>
+{
+private:
+    /* Template function for Pooling 3D NDHWC */
+    using Pooling3dKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type;
+
+public:
+    CpuPool3dKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool3dKernel);
+    /** Set the src, dst tensor and pooling info.
+     *
+     * Valid data type configurations:
+     * |src            |dst            |
+     * |:--------------|:--------------|
+     * |F16            |F16            |
+     * |F32            |F32            |
+     *
+     * @param[in]  src       Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst       Destination tensor info. Data types supported: Same as @p src.
+     * @param[in]  pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to CpuPool3dKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+
+    struct Pooling3dKernel
+    {
+        const char                  *name;
+        const DataTypeISASelectorPtr is_selected;
+        Pooling3dKernelPtr           ukernel;
+    };
+
+    static const std::vector<Pooling3dKernel> &get_available_kernels();
+
+private:
+    Pooling3dLayerInfo _pool_info{};
+    Pooling3dKernelPtr _run_method{ nullptr };
+    std::string        _name{};
+};
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/list.h b/src/cpu/kernels/pool3d/list.h
new file mode 100644
index 0000000..ece780e
--- /dev/null
+++ b/src/cpu/kernels/pool3d/list.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
+#define SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_POOLING_KERNEL(func_name) \
+    void func_name(const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window)
+
+DECLARE_POOLING_KERNEL(neon_fp16_pool3d);
+DECLARE_POOLING_KERNEL(neon_fp32_pool3d);
+
+#undef DECLARE_POOLING_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/fp16.cpp b/src/cpu/kernels/pool3d/neon/fp16.cpp
new file mode 100644
index 0000000..b79bcd9
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/fp16.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+    return poolingMxNxD_fp_neon_ndhwc<float16_t>(src, dst0, pool_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/fp32.cpp b/src/cpu/kernels/pool3d/neon/fp32.cpp
new file mode 100644
index 0000000..2c06a9d
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/fp32.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+    return poolingMxNxD_fp_neon_ndhwc<float>(src, dst0, pool_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/impl.cpp b/src/cpu/kernels/pool3d/neon/impl.cpp
new file mode 100644
index 0000000..bb3999b
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/impl.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+inline float calculate_avg_scale(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w,
+                                 const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z)
+{
+    // Based on NDHWC
+    int start_x = id[1] * stride_x - pad_x;
+    int start_y = id[2] * stride_y - pad_y;
+    int start_z = id[3] * stride_z - pad_z;
+
+    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
+    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
+    const int end_z = std::min(start_z + pool_size_z, upper_bound_d);
+    if(exclude_padding)
+    {
+        start_x = std::max(0, start_x);
+        start_y = std::max(0, start_y);
+        start_z = std::max(0, start_z);
+    }
+    return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z));
+}
+
+
+template <typename T>
+void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
+                                    const int window_start_x, const int window_end_x, const int window_step_x)
+
+{
+    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type    = typename vtype::tag_type;
+
+    int pool_stride_x = static_cast<int>(pool_info.stride.width);
+    int pool_stride_y = static_cast<int>(pool_info.stride.height);
+    int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+    const int pool_pad_top   = static_cast<int>(pool_info.padding.top);
+    const int pool_pad_left  = static_cast<int>(pool_info.padding.left);
+    const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+
+    const int input_dim_w = src->info()->dimension(1);
+    const int input_dim_h = src->info()->dimension(2);
+    const int input_dim_d = src->info()->dimension(3);
+
+    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+    Iterator out(dst0, window_out);
+
+    vector_type vres;
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        // Computing the theoretical input starting/ending points
+        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+        const int pool_start_x = std::max(0, -in_idx_width);
+        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+        const int pool_start_y = std::max(0, -in_idx_height);
+        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+        const int pool_start_z = std::max(0, -in_idx_depth);
+        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+        // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+        int x_off = window_start_x;
+
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+        {
+            vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        vres                       = wrapper::vmax(vres, data);
+                    }
+                }
+            }
+            // Store result
+            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+        }
+
+        // Left-overs loop
+        for(; x_off < window_end_x; ++x_off)
+        {
+            T res(0);
+            res = -std::numeric_limits<float>::infinity();
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        res                     = std::max(res, data);
+                    }
+                }
+            }
+            // Store result
+            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+        }
+    },
+    out);
+}
+
+template <typename T>
+void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
+                                    const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+{
+    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type    = typename vtype::tag_type;
+
+    int pool_stride_x = static_cast<int>(pool_info.stride.width);
+    int pool_stride_y = static_cast<int>(pool_info.stride.height);
+    int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+    const int pool_pad_top    = static_cast<int>(pool_info.padding.top);
+    const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+    const int pool_pad_left   = static_cast<int>(pool_info.padding.left);
+    const int pool_pad_right  = static_cast<int>(pool_info.padding.right);
+    const int pool_pad_front  = static_cast<int>(pool_info.padding.front);
+    const int pool_pad_back   = static_cast<int>(pool_info.padding.back);
+
+    const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+
+    const int input_dim_w = src->info()->dimension(1);
+    const int input_dim_h = src->info()->dimension(2);
+    const int input_dim_d = src->info()->dimension(3);
+
+    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+    Iterator out(dst0, window_out);
+
+    vector_type vres;
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        // Computing the theoretical input starting/ending points
+        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+        const int pool_start_x = std::max(0, -in_idx_width);
+        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+        const int pool_start_y = std::max(0, -in_idx_height);
+        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+        const int pool_start_z = std::max(0, -in_idx_depth);
+        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+        // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+        // Calculate scale
+        const float scale = calculate_avg_scale(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+                                                pool_pad_top, pool_pad_front, pool_stride_x,
+                                                pool_stride_y, pool_stride_z);
+        const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
+
+        int x_off = window_start_x;
+
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+        {
+            // Perform pooling
+            vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        vres                       = wrapper::vadd(vres, data);
+                    }
+                }
+            }
+
+            // Divide by scale
+            vres = wrapper::vmul(vres, scale_v);
+
+            // Store result
+            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+        }
+
+        // Left-overs loop
+        for(; x_off < window_end_x; ++x_off)
+        {
+            T res(0);
+
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        res += data;
+                    }
+                }
+            }
+
+            // Divide by scale
+            res *= scale;
+
+            // Store result
+            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+        }
+    },
+    out);
+}
+
+template <typename T>
+void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
+                                   const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+{
+    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type    = typename vtype::tag_type;
+
+    int pool_stride_x = static_cast<int>(pool_info.stride.width);
+    int pool_stride_y = static_cast<int>(pool_info.stride.height);
+    int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+    const int pool_pad_top    = static_cast<int>(pool_info.padding.top);
+    const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+    const int pool_pad_left   = static_cast<int>(pool_info.padding.left);
+    const int pool_pad_right  = static_cast<int>(pool_info.padding.right);
+    const int pool_pad_front  = static_cast<int>(pool_info.padding.front);
+    const int pool_pad_back   = static_cast<int>(pool_info.padding.back);
+
+    const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+
+    const int input_dim_w = src->info()->dimension(1);
+    const int input_dim_h = src->info()->dimension(2);
+    const int input_dim_d = src->info()->dimension(3);
+
+    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+    Iterator out(dst0, window_out);
+
+    vector_type vres;
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        // Computing the theoretical input starting/ending points
+        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+        const int pool_start_x = std::max(0, -in_idx_width);
+        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+        const int pool_start_y = std::max(0, -in_idx_height);
+        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+        const int pool_start_z = std::max(0, -in_idx_depth);
+        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+        // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+        // Calculate scale
+        const float scale = calculate_avg_scale(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+                                                pool_pad_top, pool_pad_front, pool_stride_x,
+                                                pool_stride_y, pool_stride_z);
+
+        int x_off = window_start_x;
+
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+        {
+            // Perform pooling
+            vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        vres                       = wrapper::vmla(vres, data, data);
+                    }
+                }
+            }
+
+            const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
+
+            // Divide by scale
+            vres = wrapper::vmul(vres, scale_v);
+
+            // Calculate square-root
+            vres = wrapper::vinv(wrapper::vinvsqrt(vres));
+
+            // Store result
+            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+        }
+
+        // Left-overs loop
+        for(; x_off < window_end_x; ++x_off)
+        {
+            T res(0);
+
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        res += data * data;
+                    }
+                }
+            }
+
+            // Divide by scale
+            res *= scale;
+
+            // Square root
+            res = std::sqrt(res);
+
+            // Store result
+            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+        }
+    },
+    out);
+}
+} // namespace
+
+template <typename T>
+void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+    const int     window_start_x = window.x().start();
+    const int     window_end_x   = window.x().end();
+    constexpr int window_step_x  = 16 / sizeof(T);
+    Window        window_out     = window;
+
+    // Needed to handle loop left-over
+    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    switch(pool_info.pool_type)
+    {
+        case PoolingType::MAX:
+            max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+            break;
+        case PoolingType::AVG:
+            avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+            break;
+        case PoolingType::L2:
+            l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Pool operation not supported");
+    }
+}
+
+template void poolingMxNxD_fp_neon_ndhwc<float>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void poolingMxNxD_fp_neon_ndhwc<float16_t>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h
new file mode 100644
index 0000000..829a9bd
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/impl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_POOLING_3D_LAYER_IMPL_H
+#define SRC_CORE_POOLING_3D_LAYER_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+class Window;
+struct Pooling3dLayerInfo;
+namespace cpu
+{
+template <typename T>
+void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //define SRC_CORE_POOLING_3D_LAYER_IMPL_H
diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp
new file mode 100644
index 0000000..14e4ac6
--- /dev/null
+++ b/src/cpu/operators/CpuPool3d.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuPool3d.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/Scheduler.h"
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuPool3dKernel.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuPool3d::CpuPool3d()
+    : _aux_mem(1)
+{
+}
+
+CpuPool3d::~CpuPool3d() = default;
+
+void CpuPool3d::configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info);
+
+    // Configure pooling kernel
+    auto k = std::make_unique<kernels::CpuPool3dKernel>();
+    k->configure(src, dst, pool_info);
+    _kernel = std::move(k);
+}
+
+Status CpuPool3d::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+    return kernels::CpuPool3dKernel::validate(src, dst, pool_info);
+}
+
+void CpuPool3d::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
+
+    Scheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+
+experimental::MemoryRequirements CpuPool3d::workspace() const
+{
+    return _aux_mem;
+}
+
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h
new file mode 100644
index 0000000..fc73cf0
--- /dev/null
+++ b/src/cpu/operators/CpuPool3d.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOL3D_H
+#define ARM_COMPUTE_CPU_POOL3D_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
+ *
+ * -# @ref kernels::CpuPool3dKernel
+ */
+class CpuPool3d : public ICpuOperator
+{
+public:
+    CpuPool3d();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool3d);
+    ~CpuPool3d();
+    /** Set the src and dst tensors.
+     *
+     *
+     * @param[in]  src       Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst       Destination tensor info. Data types supported: same as @p src.
+     * @param[in]  pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to CpuPool3d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOL3D_H */
diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp
new file mode 100644
index 0000000..53f9dbf0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuPool3d.h"
+
+namespace arm_compute
+{
+struct NEPooling3dLayer::Impl
+{
+    const ITensor                  *src{ nullptr };
+    ITensor                        *dst{ nullptr };
+    std::unique_ptr<cpu::CpuPool3d> op{ nullptr };
+    MemoryGroup                     memory_group{};
+    ITensorPack                     run_pack{};
+    WorkspaceData<Tensor>           workspace_tensors{};
+};
+
+NEPooling3dLayer::~NEPooling3dLayer() = default;
+
+NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _impl(std::make_unique<Impl>())
+{
+    _impl->memory_group = MemoryGroup(std::move(memory_manager));
+}
+
+void NEPooling3dLayer::configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<cpu::CpuPool3d>();
+    _impl->op->configure(input->info(), output->info(), pool_info);
+
+    _impl->run_pack          = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst } };
+    _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
+}
+
+Status NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
+{
+    return cpu::CpuPool3d::validate(input, output, pool_info);
+}
+
+void NEPooling3dLayer::run()
+{
+    MemoryGroupResourceScope scope_mg(_impl->memory_group);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst);
+    _impl->op->run(_impl->run_pack);
+}
+
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/NEON/Pooling3dLayer.cpp b/tests/validation/NEON/Pooling3dLayer.cpp
new file mode 100644
index 0000000..f651394
--- /dev/null
+++ b/tests/validation/NEON/Pooling3dLayer.cpp
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/Pooling3dLayerDataset.h"
+#include "tests/datasets/PoolingTypesDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/Pooling3dLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/** Input data sets for floating-point data types */
+const auto Pooling3dLayerDatasetFP = combine(combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { Size3D(2, 3, 2) })),
+                                                             framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(2, 2, 1) })),
+                                                     framework::dataset::make("Padding", { Padding3D(0, 1, 0), Padding3D(1, 1, 1) })),
+                                             framework::dataset::make("ExcludePadding", { true, false }));
+
+const auto Pooling3dLayerDatasetFPSmall = combine(combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { Size3D(2, 2, 2), Size3D(3, 3, 3) })),
+                                                                  framework::dataset::make("Stride", { Size3D(2, 2, 2), Size3D(2, 1, 1) })),
+                                                          framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
+                                                  framework::dataset::make("ExcludePadding", { true, false }));
+
+using ShapeDataset = framework::dataset::ContainerDataset<std::vector<TensorShape>>;
+
+constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
+constexpr AbsoluteTolerance<float> tolerance_f16(0.01f);  /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
+
+} //namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(Pooling3dLayer)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Mismatching data type
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid pad/size combination
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid pad/size combination
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid output shape
+                                            TensorInfo(TensorShape(5U, 13U, 15U, 2U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Global Pooling
+                                            TensorInfo(TensorShape(13U,13U, 5U, 1U, 2U),  1, DataType::F32, DataLayout::NDHWC),     // Invalid output Global Pooling
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid data type
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NHWC),      // Invalid data layout
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 5U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(1U, 16U,  1U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                          }),
+    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(2U, 25U, 11U, 3U, 3U), 1, DataType::F16, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 30U, 11U, 3U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 25U, 16U, 3U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 3U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U,  1U,  1U, 1U, 3U), 1, DataType::F32, DataLayout::NDHWC),            // Global pooling applied
+                                            TensorInfo(TensorShape(5U,  2U,  2U, 2U, 2U), 1, DataType::F32, DataLayout::NDHWC),            // Invalid output Global Pooling
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::QASYMM8, DataLayout::NDHWC),        // Invalid data type
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),            // Invalid data layout
+                                            TensorInfo(TensorShape(5U,  1U,  1U, 1U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(1U, 15U, 1U, 2U, 4U), 1, DataType::F32, DataLayout::NDHWC),             // size larger than height
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U),  1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 2U),  1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U),  1, DataType::F32, DataLayout::NDHWC),
+                                    })),
+    framework::dataset::make("PoolInfo",  { Pooling3dLayerInfo(PoolingType::AVG, 3, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1, 1, 1), Padding3D(2, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::L2,  3, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG),
+                                            Pooling3dLayerInfo(PoolingType::MAX),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1U, 1U, 1U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1U, 1U, 1U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG),
+                                            Pooling3dLayerInfo(PoolingType::MAX, 2, Size3D(1, 1, 2), Padding3D(0, 0, 0), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(2U, 2U, 2U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 1, Size3D(2U, 2U, 2U), Padding3D(2, 2, 2), true),  // pool size is equal to the padding size
+                                            Pooling3dLayerInfo(PoolingType::AVG, 1, Size3D(2U, 2U, 2U), Padding3D(2, 2, 2), false), // pool size is equal to the padding size
+                                            Pooling3dLayerInfo(PoolingType::AVG, 3, Size3D(2U, 2U, 2U), Padding3D(2,1,2,2,1,2), false, false, DimensionRoundingType::CEIL), // CEIL with asymmetric Padding
+                                            })),
+    framework::dataset::make("Expected", { false, false, false, false, true, false, false, false, false, true , false, true, false, false, false})),
+    input_info, output_info, pool_info, expected)
+{
+    bool is_valid = bool(NEPooling3dLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using NEPoolingLayer3dFixture = Pooling3dLayerValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>;
+
+template <typename T>
+using NESpecial3dPoolingLayerFixture = SpecialPooling3dLayerValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>;
+
+template <typename T>
+using NEPooling3dLayerGlobalFixture = Pooling3dLayerGlobalValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>;
+
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunSpecial, NESpecial3dPoolingLayerFixture<float>, framework::DatasetMode::ALL, datasets::Pooling3dLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small5dShapes(), combine(Pooling3dLayerDatasetFPSmall,
+                                                                                                            framework::dataset::make("DataType", DataType::F32))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFPSmall, framework::dataset::make("DataType", DataType::F32))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(3U, 27U, 13U, 4U),
+                                                                             TensorShape(4U, 27U, 13U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(27, 13, 4) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunGlobalSmall, NEPooling3dLayerGlobalFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 4U, 3U),
+                                                                             TensorShape(27U, 13U, 4U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(4U, 79U, 37U, 11U),
+                                                                             TensorShape(4U, 79U, 37U, 11U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(79, 37, 11) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP32
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small5x5Shapes(), combine(Pooling3dLayerDatasetFPSmall,
+                                                                                                           framework::dataset::make("DataType", DataType::F16))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFP,
+                                                                                                           framework::dataset::make("DataType",
+                                                                                                                   DataType::F16))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(3U, 27U, 13U, 4U),
+                                                                             TensorShape(4U, 27U, 13U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(27, 13, 4) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunSmallGlobal, NEPooling3dLayerGlobalFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 4U, 3U),
+                                                                             TensorShape(27U, 13U, 4U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(4U, 79U, 37U, 11U),
+                                                                             TensorShape(4U, 79U, 37U, 11U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(79, 37, 11) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // Pooling3dLayer
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute