Add CPU Pool3d FP16/32 implementation

- Add implementation for the CPU pooling 3d layer.
- NDHWC data layout support
- Support FP32/FP16.
- Add Pool3d to the operator list.
- Fix CL Pool3d kernel comments to generate the operator list.

Resolves: COMPMID-4671

Signed-off-by: Adnan AlSinan <adnan.alsinan@arm.com>
Change-Id: I92478a154beb12541525b648ed3dd5a58c8f27fa
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7311
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
(cherry picked from commit 572659a0e5dd1086b1c7d16fe331ff73d2acd93a)
diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp
new file mode 100644
index 0000000..3321967
--- /dev/null
+++ b/src/cpu/kernels/CpuPool3dKernel.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuPool3dKernel.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/pool3d/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+using namespace misc::shape_calculator;
+
+static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels = // Each entry: { name, selector predicate, micro-kernel }
+{
+    { // FP16 entry: selected for F16 data only when the ISA info also reports fp16 support
+        "neon_fp16_ndhwc_poolMxNxD",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16 && data.isa.fp16); },
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)
+    },
+    // FP32 entry: selected on the data type alone (no ISA feature is checked)
+    {
+        "neon_fp32_ndhwc_poolMxNxD",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)
+    }
+};
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{ // Runs every check below on src/dst/pool_info; Status{} is returned only if none of them fails
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+    // Resolve the width/height/depth dimension indices for the source data layout
+    const auto data_layout = src->data_layout();
+    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int  idx_depth   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+    // Global pooling uses the full source width/height/depth as the pool size
+    const bool         is_global_pooling = pool_info.is_global_pooling;
+    const unsigned int pool_size_x       = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    const unsigned int pool_size_y       = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    const unsigned int pool_size_z       = is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
+
+    const unsigned int stride_x = pool_info.stride.x();
+    const unsigned int stride_y = pool_info.stride.y();
+    const unsigned int stride_z = pool_info.stride.z();
+    // A pool size or stride of zero in any dimension is rejected
+    ARM_COMPUTE_RETURN_ERROR_ON((pool_size_x == 0) || (pool_size_y == 0) || (pool_size_z == 0));
+    ARM_COMPUTE_RETURN_ERROR_ON((stride_x == 0) || (stride_y == 0) || (stride_z == 0));
+    // Output dimensions are held as signed ints so that results below 1 can be rejected
+    int output_width  = 0;
+    int output_height = 0;
+    int output_depth  = 0;
+
+    std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], src->tensor_shape()[idx_depth],
+                                                                                      pool_size_x, pool_size_y, pool_size_z, pool_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid");
+    // dst is only compared against src (type, layout, expected shape) when its total size is non-zero
+    if(dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+        TensorInfo out_info(TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
+    }
+    // A micro-kernel with a non-null function pointer must exist for this data type / ISA combination
+    const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+    return Status{};
+}
+} //namespace
+
+void CpuPool3dKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info));
+
+    // Auto-initialize dst from a clone of src with the computed pooling output shape, if dst is still empty
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool3d_shape(src->tensor_shape(), pool_info)));
+
+    // Get data layout
+    const auto data_layout = src->data_layout();
+    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int  idx_depth   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+
+    // Pool size, taken from the source W/H/D extents for global pooling (NOTE(review): pool_size is not read again in this function)
+    const bool   is_global_pooling = pool_info.is_global_pooling;
+    const Size3D pool_size(
+        is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+        is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height,
+        is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth);
+    // Select the micro-kernel matching the source data type and the CPU ISA
+    const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    ARM_COMPUTE_ERROR_ON(uk == nullptr);
+
+    // Set instance variables
+    _pool_info  = pool_info;
+    _run_method = uk->ukernel;
+    _name       = std::string("CpuPool3dKernel").append("/").append(uk->name);
+
+    // Configure kernel window: maximum window over dst with default Steps()
+    Window win = calculate_max_window(*dst, Steps());
+    ICpuKernel::configure(win);
+}
+
+Status CpuPool3dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+    // dst is null-checked inside validate_arguments(), together with all remaining checks
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info));
+
+    return Status{};
+}
+
+void CpuPool3dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+    // Fetch the source and destination tensors from the pack
+    const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    ITensor       *dst = tensors.get_tensor(TensorType::ACL_DST_0);
+    // Dispatch to the micro-kernel stored by configure(), using the pooling info saved there
+    _run_method(src, dst, _pool_info, window);
+}
+
+const char *CpuPool3dKernel::name() const
+{
+    return _name.c_str(); // "CpuPool3dKernel/<micro-kernel name>" once configure() has run; empty string before that
+}
+
+const std::vector<CpuPool3dKernel::Pooling3dKernel> &CpuPool3dKernel::get_available_kernels()
+{
+    return available_kernels; // File-local table of the FP16 and FP32 micro-kernel entries
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h
new file mode 100644
index 0000000..f762cfc
--- /dev/null
+++ b/src/cpu/kernels/CpuPool3dKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOL3D_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL3D_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to perform Pooling 3D. */
+class CpuPool3dKernel : public ICpuKernel<CpuPool3dKernel>
+{
+private:
+    /* Function pointer type of a Pooling 3D NDHWC micro-kernel: (src, dst, pool_info, window) */
+    using Pooling3dKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type;
+
+public:
+    CpuPool3dKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool3dKernel);
+    /** Set the src, dst tensor and pooling info.
+     *
+     * Valid data type configurations:
+     * |src            |dst            |
+     * |:--------------|:--------------|
+     * |F16            |F16            |
+     * |F32            |F32            |
+     *
+     * @param[in]  src       Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst       Destination tensor info. Data types supported: Same as @p src.
+     * @param[in]  pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Takes the same arguments as CpuPool3dKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+    /** Descriptor of one selectable micro-kernel implementation. */
+    struct Pooling3dKernel
+    {
+        const char                  *name;        // Micro-kernel name, appended to the kernel name in configure()
+        const DataTypeISASelectorPtr is_selected; // Predicate on data type / ISA deciding whether this entry applies
+        Pooling3dKernelPtr           ukernel;     // Function to run; checked against nullptr during validation
+    };
+    /** @return the table of micro-kernel descriptors known to this kernel */
+    static const std::vector<Pooling3dKernel> &get_available_kernels();
+
+private:
+    Pooling3dLayerInfo _pool_info{};            // Copy of the pooling info passed to configure()
+    Pooling3dKernelPtr _run_method{ nullptr };  // Micro-kernel selected in configure(); nullptr until then
+    std::string        _name{};                 // String returned by name()
+};
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/list.h b/src/cpu/kernels/pool3d/list.h
new file mode 100644
index 0000000..ece780e
--- /dev/null
+++ b/src/cpu/kernels/pool3d/list.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
+#define SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_POOLING_KERNEL(func_name) \
+    void func_name(const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window)
+// Micro-kernel entry points; the helper macro above only exists for these declarations and is undefined again below
+DECLARE_POOLING_KERNEL(neon_fp16_pool3d); // Defined in neon/fp16.cpp only when __ARM_FEATURE_FP16_VECTOR_ARITHMETIC and ENABLE_FP16_KERNELS are set
+DECLARE_POOLING_KERNEL(neon_fp32_pool3d); // Defined in neon/fp32.cpp
+
+#undef DECLARE_POOLING_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/fp16.cpp b/src/cpu/kernels/pool3d/neon/fp16.cpp
new file mode 100644
index 0000000..b79bcd9
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/fp16.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+    return poolingMxNxD_fp_neon_ndhwc<float16_t>(src, dst0, pool_info, window); // float16_t instantiation of the shared NDHWC implementation
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/fp32.cpp b/src/cpu/kernels/pool3d/neon/fp32.cpp
new file mode 100644
index 0000000..2c06a9d
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/fp32.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+    return poolingMxNxD_fp_neon_ndhwc<float>(src, dst0, pool_info, window); // float instantiation of the shared NDHWC implementation
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/impl.cpp b/src/cpu/kernels/pool3d/neon/impl.cpp
new file mode 100644
index 0000000..bb3999b
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/impl.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+inline float calculate_avg_scale(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w,
+                                 const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z)
+{
+    // Returns 1 / (element count of the pooling region at output coordinate id); NDHWC, so id[1] = W, id[2] = H, id[3] = D
+    int start_x = id[1] * stride_x - pad_x;
+    int start_y = id[2] * stride_y - pad_y;
+    int start_z = id[3] * stride_z - pad_z;
+    // The region end is limited by the caller-supplied upper bounds
+    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
+    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
+    const int end_z = std::min(start_z + pool_size_z, upper_bound_d);
+    if(exclude_padding)
+    {
+        start_x = std::max(0, start_x); // Clamp negative starts to 0 so that leading padding is not counted
+        start_y = std::max(0, start_y);
+        start_z = std::max(0, start_z);
+    }
+    return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z)); // The product is computed in int before the float division
+}
+
+
+template <typename T> // NDHWC max pooling over a 3D window for element type T; writes one maximum per output element
+void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
+                                    const int window_start_x, const int window_end_x, const int window_step_x)
+
+{
+    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type    = typename vtype::tag_type;
+
+    int pool_stride_x = static_cast<int>(pool_info.stride.width);
+    int pool_stride_y = static_cast<int>(pool_info.stride.height);
+    int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+    // For global pooling the pool size is the full source extent along dims 1 (W), 2 (H) and 3 (D)
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+    const int pool_pad_top   = static_cast<int>(pool_info.padding.top);
+    const int pool_pad_left  = static_cast<int>(pool_info.padding.left);
+    const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+
+    const int input_dim_w = src->info()->dimension(1);
+    const int input_dim_h = src->info()->dimension(2);
+    const int input_dim_d = src->info()->dimension(3);
+    // Source byte strides: y_stride steps along width, z_stride along height, w_stride along depth, n_stride along batch
+    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+    Iterator out(dst0, window_out);
+
+    vector_type vres;
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        // Input coordinates of the pooling region origin for this output element (negative when it starts in the padding)
+        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+        // Region-relative start offsets skip the part of the region that lies before index 0
+        const int pool_start_x = std::max(0, -in_idx_width);
+        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+        const int pool_start_y = std::max(0, -in_idx_height);
+        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+        const int pool_start_z = std::max(0, -in_idx_depth);
+        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+        // Clamp the region ends to the source width/height/depth so that no padding element is read
+        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+        int x_off = window_start_x;
+
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // Vectorized loop over channels (C), window_step_x elements per step
+        {
+            vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        vres                       = wrapper::vmax(vres, data);
+                    }
+                }
+            }
+            // Store result
+            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+        }
+
+        // Left-overs loop: the remaining channels are processed one element at a time
+        for(; x_off < window_end_x; ++x_off)
+        {
+            T res(0);
+            res = -std::numeric_limits<float>::infinity();
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        res                     = std::max(res, data);
+                    }
+                }
+            }
+            // Store result
+            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+        }
+    },
+    out);
+}
+
+template <typename T> // NDHWC average pooling over a 3D window for element type T; sums the region and scales by its reciprocal size
+void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
+                                    const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+{
+    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type    = typename vtype::tag_type;
+
+    int pool_stride_x = static_cast<int>(pool_info.stride.width);
+    int pool_stride_y = static_cast<int>(pool_info.stride.height);
+    int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+    // For global pooling the pool size is the full source extent along dims 1 (W), 2 (H) and 3 (D)
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+    const int pool_pad_top    = static_cast<int>(pool_info.padding.top);
+    const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+    const int pool_pad_left   = static_cast<int>(pool_info.padding.left);
+    const int pool_pad_right  = static_cast<int>(pool_info.padding.right);
+    const int pool_pad_front  = static_cast<int>(pool_info.padding.front);
+    const int pool_pad_back   = static_cast<int>(pool_info.padding.back);
+    // Upper bounds for the averaging region: right/bottom/back padding is added only when padding is not excluded
+    const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+
+    const int input_dim_w = src->info()->dimension(1);
+    const int input_dim_h = src->info()->dimension(2);
+    const int input_dim_d = src->info()->dimension(3);
+    // Source byte strides: y_stride steps along width, z_stride along height, w_stride along depth, n_stride along batch
+    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+    Iterator out(dst0, window_out);
+
+    vector_type vres;
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        // Input coordinates of the pooling region origin for this output element (negative when it starts in the padding)
+        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+        // Region-relative start offsets skip the part of the region that lies before index 0
+        const int pool_start_x = std::max(0, -in_idx_width);
+        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+        const int pool_start_y = std::max(0, -in_idx_height);
+        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+        const int pool_start_z = std::max(0, -in_idx_depth);
+        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+        // Clamp the region ends to the source width/height/depth so that no padding element is read
+        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+        // Calculate scale: the reciprocal of the averaging region size, see calculate_avg_scale()
+        const float scale = calculate_avg_scale(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+                                                pool_pad_top, pool_pad_front, pool_stride_x,
+                                                pool_stride_y, pool_stride_z);
+        const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
+
+        int x_off = window_start_x;
+
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // Vectorized loop over channels (C), window_step_x elements per step
+        {
+            // Perform pooling
+            vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        vres                       = wrapper::vadd(vres, data);
+                    }
+                }
+            }
+
+            // Multiply the sum by scale (a reciprocal), which divides it by the region size
+            vres = wrapper::vmul(vres, scale_v);
+
+            // Store result
+            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+        }
+
+        // Left-overs loop: the remaining channels are processed one element at a time
+        for(; x_off < window_end_x; ++x_off)
+        {
+            T res(0);
+
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        res += data;
+                    }
+                }
+            }
+
+            // Multiply the sum by scale (a reciprocal), which divides it by the region size
+            res *= scale;
+
+            // Store result
+            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+        }
+    },
+    out);
+}
+
+/** L2 pooling over an MxNxD window for floating-point NDHWC tensors: dst = sqrt(scale * sum(src^2)).
+ *
+ * @param[in]  src            Source tensor; dimensions 1 to 4 are read as width, height, depth and batch.
+ * @param[out] dst0           Destination tensor, written through an iterator over @p window_out.
+ * @param[in]  pool_info      Pooling sizes, strides, padding and the exclude_padding / is_global_pooling flags.
+ * @param[in]  window_out     Window driving the output iteration.
+ * @param[in]  window_start_x First element of the X range processed at each window position.
+ * @param[in]  window_end_x   End (exclusive) of that X range.
+ * @param[in]  window_step_x  Elements consumed per vector iteration (loads are 128-bit wide); the remainder is handled one element at a time.
+ */
+template <typename T>
+void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
+                                   const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+{
+    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type    = typename vtype::tag_type;
+
+    int pool_stride_x = static_cast<int>(pool_info.stride.width);
+    int pool_stride_y = static_cast<int>(pool_info.stride.height);
+    int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+    const int pool_pad_top    = static_cast<int>(pool_info.padding.top);
+    const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+    const int pool_pad_left   = static_cast<int>(pool_info.padding.left);
+    const int pool_pad_right  = static_cast<int>(pool_info.padding.right);
+    const int pool_pad_front  = static_cast<int>(pool_info.padding.front);
+    const int pool_pad_back   = static_cast<int>(pool_info.padding.back);
+
+    const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+
+    const int input_dim_w = src->info()->dimension(1);
+    const int input_dim_h = src->info()->dimension(2);
+    const int input_dim_d = src->info()->dimension(3);
+
+    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+    Iterator       out(dst0, window_out);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        // Input coordinates of the window origin; negative while the origin lies inside the leading padding
+        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+        const int pool_start_x = std::max(0, -in_idx_width);
+        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+        const int pool_start_y = std::max(0, -in_idx_height);
+        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+        const int pool_start_z = std::max(0, -in_idx_depth);
+        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+        // Clamp the window ends to the input extent so that no padded element is read
+        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+        // The scale depends only on the output coordinates, so its vector form is built once per window position
+        const float scale = calculate_avg_scale(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+                                                pool_pad_top, pool_pad_front, pool_stride_x,
+                                                pool_stride_y, pool_stride_z);
+        const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
+
+        int x_off = window_start_x;
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+        {
+            // Accumulate the sum of squares over the valid part of the pooling window
+            vector_type vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        vres                       = wrapper::vmla(vres, data, data);
+                    }
+                }
+            }
+
+            // Scale the sum, then take the square root as 1 / (1 / sqrt(x)); NOTE(review): confirm this matches std::sqrt below for x == 0
+            vres = wrapper::vmul(vres, scale_v);
+            vres = wrapper::vinv(wrapper::vinvsqrt(vres));
+
+            // Store result
+            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+        }
+
+        // Left-overs loop: same computation, one element at a time
+        for(; x_off < window_end_x; ++x_off)
+        {
+            T res(0);
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        res += data * data;
+                    }
+                }
+            }
+
+            // Scale the sum of squares, then take the square root
+            res *= scale;
+            res = std::sqrt(res);
+
+            // Store result
+            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+        }
+    },
+    out);
+}
+} // namespace
+
+/** Floating-point NDHWC 3D pooling entry point: forwards the X range of @p window as explicit bounds and dispatches on @p pool_info.pool_type. */
+template <typename T>
+void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+    constexpr int elems_per_vector = 16 / sizeof(T); // Elements of type T in 16 bytes
+    const int     x_begin          = window.x().start();
+    const int     x_end            = window.x().end();
+    // X is reduced to a single iteration in the forwarded window; the helpers receive the real X bounds and handle the left-overs
+    Window collapsed_window(window);
+    collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    switch(pool_info.pool_type)
+    {
+        case PoolingType::MAX:
+            max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, collapsed_window, x_begin, x_end, elems_per_vector);
+            break;
+        case PoolingType::AVG:
+            avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, collapsed_window, x_begin, x_end, elems_per_vector);
+            break;
+        case PoolingType::L2:
+            l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, collapsed_window, x_begin, x_end, elems_per_vector);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Pool operation not supported");
+    }
+}
+
+template void poolingMxNxD_fp_neon_ndhwc<float>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window); // Explicit instantiation for float
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void poolingMxNxD_fp_neon_ndhwc<float16_t>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window); // Explicit instantiation for float16_t, compiled only under the guard above
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h
new file mode 100644
index 0000000..829a9bd
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/impl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_POOLING_3D_LAYER_IMPL_H
+#define SRC_CORE_POOLING_3D_LAYER_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+// Forward declarations: the signature below only uses these types through pointers and references
+class ITensor;
+class Window;
+struct Pooling3dLayerInfo;
+namespace cpu
+{
+template <typename T>
+void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
+// Declaration only: T = float, and float16_t when FP16 kernels are enabled, are explicitly instantiated next to the definition
+} // namespace cpu
+} // namespace arm_compute
+#endif //define SRC_CORE_POOLING_3D_LAYER_IMPL_H
diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp
new file mode 100644
index 0000000..14e4ac6
--- /dev/null
+++ b/src/cpu/operators/CpuPool3d.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuPool3d.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/Scheduler.h"
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuPool3dKernel.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuPool3d::CpuPool3d()
+    : _aux_mem(1) // NOTE(review): presumably reserves a single workspace entry - confirm against experimental::MemoryRequirements
+{
+}
+// The destructor is declared in the class and defaulted here
+CpuPool3d::~CpuPool3d() = default;
+
+void CpuPool3d::configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info);
+
+    // Create the 3D pooling kernel, configure it, and only then hand it over to the operator
+    auto pool3d_kernel = std::make_unique<kernels::CpuPool3dKernel>();
+    pool3d_kernel->configure(src, dst, pool_info);
+    _kernel = std::move(pool3d_kernel);
+}
+
+Status CpuPool3d::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+    return kernels::CpuPool3dKernel::validate(src, dst, pool_info); // No operator-level checks: the kernel's validate() result is returned unchanged
+}
+
+void CpuPool3d::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
+    // _kernel is assigned by configure(); it is scheduled over its whole window, with Window::DimY passed as the scheduler's dimension argument
+    Scheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+
+experimental::MemoryRequirements CpuPool3d::workspace() const
+{
+    return _aux_mem; // Returned by value: a copy of the requirements initialised in the constructor
+}
+
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h
new file mode 100644
index 0000000..fc73cf0
--- /dev/null
+++ b/src/cpu/operators/CpuPool3d.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOL3D_H
+#define ARM_COMPUTE_CPU_POOL3D_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run a 3D pooling layer with the specified pooling operation. This function calls the following kernels:
+ *
+ * -# @ref kernels::CpuPool3dKernel
+ */
+class CpuPool3d : public ICpuOperator
+{
+public:
+    CpuPool3d();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool3d);
+    ~CpuPool3d();
+    /** Configure the operator from the src and dst tensor infos.
+     *
+     * @note Data layout supported: NDHWC.
+     * @param[in]  src       Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst       Destination tensor info. Data types supported: same as @p src.
+     * @param[in]  pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Takes the same arguments as CpuPool3d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    experimental::MemoryRequirements _aux_mem{}; // Memory requirements reported by workspace()
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOL3D_H */