Add support for int8 CpuPool3d

- Add implementation for the CPU pooling 3d layer.
- NDHWC data layout support.
- Support QASYMM8/QASYMM8_SIGNED.
- Add Pooling helper file for Pool3d/2d common functions.

Resolves COMPMID-4668

Change-Id: Iadf042036b076099c2353d6e2fe9fc623bc263d8
Signed-off-by: Adnan AlSinan <adnan.alsinan@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7387
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp
index 3321967..1305f7c 100644
--- a/src/cpu/kernels/CpuPool3dKernel.cpp
+++ b/src/cpu/kernels/CpuPool3dKernel.cpp
@@ -44,11 +44,20 @@
 static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels =
 {
     {
+        "neon_qu8_ndhwc_poolMxNxD",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
+        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)
+    },
+    {
+        "neon_qs8_ndhwc_poolMxNxD",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)
+    },
+    {
         "neon_fp16_ndhwc_poolMxNxD",
         [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16 && data.isa.fp16); },
         REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)
     },
-
     {
         "neon_fp32_ndhwc_poolMxNxD",
         [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
@@ -61,7 +70,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+
+    // Non-float (quantized) AVG pooling is rejected when exclude_padding is false, i.e. only the exclude-padding variant passes validation.
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_data_type_float(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG),
+                                    "Including padding (exclude_padding = false) is unsupported for non-float types for Avg op");
 
     const auto data_layout = src->data_layout();
     const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h
index f762cfc..437f2af 100644
--- a/src/cpu/kernels/CpuPool3dKernel.h
+++ b/src/cpu/kernels/CpuPool3dKernel.h
@@ -51,8 +51,10 @@
      * |:--------------|:--------------|
      * |F16            |F16            |
      * |F32            |F32            |
+     * |QASYMM8        |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |
      *
-     * @param[in]  src       Source tensor info. Data types supported: F16/F32.
+     * @param[in]  src       Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
      * @param[out] dst       Destination tensor info. Data types supported: Same as @p src.
      * @param[in]  pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
      */
diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp
index 72f63af..13e21b1 100644
--- a/src/cpu/kernels/pool2d/neon/fp16.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -199,8 +199,8 @@
             if(pool_info.pool_type != PoolingType::MAX)
             {
                 // Calculate scale
-                const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                        pool_stride_y);
+                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                               pool_stride_y);
                 const float16x8_t scale_v = vdupq_n_f16(scale);
 
                 // Perform pooling
@@ -260,8 +260,8 @@
             if(pool_info.pool_type != PoolingType::MAX)
             {
                 // Calculate scale
-                const float16_t scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                            pool_stride_y);
+                const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                                   pool_stride_y);
 
                 for(int y = pool_start_y; y < pool_end_y; ++y)
                 {
diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp
index e4261f7..1ed199b 100644
--- a/src/cpu/kernels/pool2d/neon/fp32.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -193,8 +193,8 @@
                 if(pool_info.pool_type != PoolingType::MAX)
                 {
                     // Calculate scale
-                    const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                            pool_stride_y);
+                    const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                                   pool_stride_y);
                     const float32x4_t scale_v = vdupq_n_f32(scale);
 
                     // Perform pooling
@@ -258,8 +258,8 @@
                 if(pool_info.pool_type != PoolingType::MAX)
                 {
                     // Calculate scale
-                    const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                            pool_stride_y);
+                    const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                                   pool_stride_y);
 
                     for(int y = pool_start_y; y < pool_end_y; ++y)
                     {
diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
index 10cbfc5..77f63c6 100644
--- a/src/cpu/kernels/pool2d/neon/nchw/all.cpp
+++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
@@ -124,7 +124,7 @@
         if(pool_info.pool_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+            const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
                                                     pool_stride_y);
             const float16x4_t scale_v = vdup_n_f16(scale);
             // Perform pooling
@@ -288,7 +288,7 @@
 
             if(pool_info.pool_type != PoolingType::MAX)
             {
-                const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
                                                         pool_stride_y);
                 const float16x4_t scale_v = vdup_n_f16(scale);
 
@@ -343,7 +343,7 @@
         if(pool_info.pool_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float16_t scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+            const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
                                                         pool_stride_y);
 
             // Perform pooling
@@ -430,7 +430,7 @@
         if(pool_info.pool_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h,
+            const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h,
                                                     pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
 
             // Perform pooling
@@ -538,7 +538,7 @@
             if(pool_info.pool_type != PoolingType::MAX)
             {
                 // Calculate scale
-                float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
                                                   pool_stride_y);
                 const float32x2_t scale_v = vdup_n_f32(scale);
 
@@ -618,7 +618,7 @@
         if(pool_info.pool_type != PoolingType::MAX)
         {
             // Calculate scale
-            float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+            float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
                                               pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
@@ -687,7 +687,7 @@
         if(pool_info.pool_type != PoolingType::MAX)
         {
             // Calculate scale
-            float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+            float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
                                               pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h
index 386e043..a2cd399 100644
--- a/src/cpu/kernels/pool2d/neon/quantized.h
+++ b/src/cpu/kernels/pool2d/neon/quantized.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/PoolingHelpers.h"
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -37,148 +38,6 @@
 namespace cpu
 {
 template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
-quantize(float val, const UniformQuantizationInfo &info)
-{
-    return quantize_qasymm8_signed(val, info);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
-quantize(float val, const UniformQuantizationInfo &info)
-{
-    return quantize_qasymm8(val, info);
-}
-
-template <typename T>
-inline T vcvtq_q32_f32(float32x4_t values);
-
-template <>
-inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
-{
-    return vcvtq_u32_f32(values);
-}
-
-template <>
-inline int32x4_t vcvtq_q32_f32(float32x4_t values)
-{
-    return vcvtq_s32_f32(values);
-}
-
-template <typename T>
-inline float32x4_t vcvtq_f32_q32(T values);
-
-template <>
-inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
-{
-    return vcvtq_f32_u32(values);
-}
-
-template <>
-inline float32x4_t vcvtq_f32_q32(int32x4_t values)
-{
-    return vcvtq_f32_s32(values);
-}
-
-template <typename Tout>
-inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
-
-template <>
-inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
-{
-    const float new_scale = quant_rescale / scale_pooling;
-    return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
-}
-
-template <>
-inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
-{
-    const float new_scale = quant_rescale / scale_pooling;
-    return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
-}
-
-template <typename Tin, typename Tout>
-inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
-
-template <>
-inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
-{
-    const float32x4x4_t acc =
-    {
-        {
-            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
-            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
-            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
-            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
-        }
-    };
-    return vquantize(acc, requant_qinfo);
-}
-
-template <>
-inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
-{
-    const float32x4x4_t acc =
-    {
-        {
-            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
-            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
-            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
-            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
-        }
-    };
-    return vquantize_signed(acc, requant_qinfo);
-}
-
-template <typename T>
-inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
-
-template <>
-inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
-{
-    const float32x4x2_t acc =
-    {
-        {
-            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
-            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
-        }
-    };
-    return vquantize(acc, requant_qinfo);
-}
-
-template <>
-inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
-{
-    const float32x4x2_t acc =
-    {
-        {
-            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
-            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
-        }
-    };
-    return vquantize_signed(acc, requant_qinfo);
-}
-
-inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
-                                 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
-    const unsigned int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-    int start_x = id[idx_width] * stride_x - pad_x;
-    int start_y = id[idx_height] * stride_y - pad_y;
-
-    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
-    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
-    if(exclude_padding)
-    {
-        start_x = std::max(0, start_x);
-        start_y = std::max(0, start_y);
-    }
-    return 1.f / ((end_y - start_y) * (end_x - start_x));
-}
-
-template <typename T>
 void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
 {
     ARM_COMPUTE_UNUSED(dst1);
@@ -250,8 +109,8 @@
                 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
 
                 // Calculate scale
-                const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                        pool_stride_y);
+                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                               pool_stride_y);
 
                 // Perform pooling
                 for(int y = pool_start_y; y < pool_end_y; ++y)
@@ -352,8 +211,8 @@
                 q32_t res = static_cast<q32_t>(0.f);
 
                 // Calculate scale
-                const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                        pool_stride_y);
+                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                               pool_stride_y);
 
                 // Perform pooling
                 for(int y = pool_start_y; y < pool_end_y; ++y)
@@ -531,8 +390,8 @@
     Iterator out(dst0, window);
 
     /** SIMD vector types */
-    using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
-    using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+    using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
+    using q8x16_t   = typename wrapper::traits::neon_vector<T, 16>::type;
     using q16_t     = typename wrapper::traits::promote_t<T>;
     using q16x4_t   = typename wrapper::traits::neon_vector<q16_t, 4>::type;
     using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
@@ -867,8 +726,8 @@
             q32_t sres = 0;
 
             // Calculate scale
-            const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                    pool_stride_y);
+            const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                           pool_stride_y);
 
             // Perform pooling
             for(int y = 0; y < pool_size_y; ++y)
diff --git a/src/cpu/kernels/pool3d/list.h b/src/cpu/kernels/pool3d/list.h
index ece780e..3426360 100644
--- a/src/cpu/kernels/pool3d/list.h
+++ b/src/cpu/kernels/pool3d/list.h
@@ -31,6 +31,8 @@
 #define DECLARE_POOLING_KERNEL(func_name) \
     void func_name(const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window)
 
+DECLARE_POOLING_KERNEL(neon_q8_pool3d);
+DECLARE_POOLING_KERNEL(neon_q8_signed_pool3d);
 DECLARE_POOLING_KERNEL(neon_fp16_pool3d);
 DECLARE_POOLING_KERNEL(neon_fp32_pool3d);
 
diff --git a/src/cpu/kernels/pool3d/neon/impl.cpp b/src/cpu/kernels/pool3d/neon/impl.cpp
index bb3999b..2b089f3 100644
--- a/src/cpu/kernels/pool3d/neon/impl.cpp
+++ b/src/cpu/kernels/pool3d/neon/impl.cpp
@@ -22,11 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/core/helpers/PoolingHelpers.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/pool3d/neon/quantized.h"
 
 #include "src/cpu/kernels/pool3d/neon/impl.h"
 
@@ -36,27 +35,6 @@
 {
 namespace
 {
-inline float calculate_avg_scale(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w,
-                                 const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z)
-{
-    // Based on NDHWC
-    int start_x = id[1] * stride_x - pad_x;
-    int start_y = id[2] * stride_y - pad_y;
-    int start_z = id[3] * stride_z - pad_z;
-
-    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
-    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
-    const int end_z = std::min(start_z + pool_size_z, upper_bound_d);
-    if(exclude_padding)
-    {
-        start_x = std::max(0, start_x);
-        start_y = std::max(0, start_y);
-        start_z = std::max(0, start_z);
-    }
-    return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z));
-}
-
-
 template <typename T>
 void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
                                     const int window_start_x, const int window_end_x, const int window_step_x)
@@ -227,9 +205,9 @@
         const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
 
         // Calculate scale
-        const float scale = calculate_avg_scale(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
-                                                pool_pad_top, pool_pad_front, pool_stride_x,
-                                                pool_stride_y, pool_stride_z);
+        const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+                                                       pool_pad_top, pool_pad_front, pool_stride_x,
+                                                       pool_stride_y, pool_stride_z);
         const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
 
         int x_off = window_start_x;
@@ -354,9 +332,9 @@
         const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
 
         // Calculate scale
-        const float scale = calculate_avg_scale(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
-                                                pool_pad_top, pool_pad_front, pool_stride_x,
-                                                pool_stride_y, pool_stride_z);
+        const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+                                                       pool_pad_top, pool_pad_front, pool_stride_x,
+                                                       pool_stride_y, pool_stride_z);
 
         int x_off = window_start_x;
 
@@ -452,9 +430,33 @@
     }
 }
 
+template <typename T>
+void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+    // Entry point for 8-bit quantized 3D pooling: forwards to the MAX or AVG implementation chosen by pool_info.pool_type.
+    constexpr int window_step_x = 16;
+    Window        window_out    = window;
+    window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); // Needed to handle loop left-over
+
+    if(pool_info.pool_type == PoolingType::MAX)
+    {
+        max_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x);
+    }
+    else if(pool_info.pool_type == PoolingType::AVG)
+    {
+        avg_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Pool operation not supported");
+    }
+}
+
 template void poolingMxNxD_fp_neon_ndhwc<float>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 template void poolingMxNxD_fp_neon_ndhwc<float16_t>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+template void poolingMxNxD_q8_neon_ndhwc<uint8_t>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
+template void poolingMxNxD_q8_neon_ndhwc<int8_t>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h
index 829a9bd..7ad8c8e 100644
--- a/src/cpu/kernels/pool3d/neon/impl.h
+++ b/src/cpu/kernels/pool3d/neon/impl.h
@@ -37,6 +37,8 @@
 template <typename T>
 void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
 
+template <typename T>
+void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
 #endif //define SRC_CORE_POOLING_3D_LAYER_IMPL_H
diff --git a/src/cpu/kernels/pool3d/neon/qasymm8.cpp b/src/cpu/kernels/pool3d/neon/qasymm8.cpp
new file mode 100644
index 0000000..650a815
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/qasymm8.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_q8_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+    poolingMxNxD_q8_neon_ndhwc<uint8_t>(src, dst0, pool_info, window); // forwards to the uint8_t instantiation of the shared q8 implementation
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000..374b243
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_q8_signed_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+    poolingMxNxD_q8_neon_ndhwc<int8_t>(src, dst0, pool_info, window); // forwards to the int8_t instantiation of the shared q8 implementation
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h
new file mode 100644
index 0000000..ac14f5e
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/quantized.h
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
+#define SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/PoolingHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
+                                    const int window_step_x)
+// Quantized 3D average pooling: sums the 8-bit inputs of each pooling region in 32-bit lanes, applies the averaging scale and requantizes when src/dst quantization differ.
+{
+    using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
+    using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+    using q16_t   = typename wrapper::traits::promote_t<T>;
+    using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
+    using q32_t   = typename wrapper::traits::promote_t<q16_t>;
+    using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
+    // Pooling geometry (strides, window sizes, paddings) read from pool_info.
+    int pool_stride_x = static_cast<int>(pool_info.stride.width);
+    int pool_stride_y = static_cast<int>(pool_info.stride.height);
+    int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+    // Global pooling uses the full input extent along dimensions 1..3 as the window size.
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+    const int pool_pad_top    = static_cast<int>(pool_info.padding.top);
+    const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+    const int pool_pad_left   = static_cast<int>(pool_info.padding.left);
+    const int pool_pad_right  = static_cast<int>(pool_info.padding.right);
+    const int pool_pad_front  = static_cast<int>(pool_info.padding.front);
+    const int pool_pad_back   = static_cast<int>(pool_info.padding.back);
+    // Upper bounds handed to the scale computation; trailing padding is counted only when exclude_padding is false.
+    const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+    // Input extents; the variable names follow the NDHWC layout (dimension 0 = channels).
+    const int input_dim_c = src->info()->dimension(0);
+    const int input_dim_w = src->info()->dimension(1);
+    const int input_dim_h = src->info()->dimension(2);
+    const int input_dim_d = src->info()->dimension(3);
+    // Byte strides of the source tensor for dimensions 1..4.
+    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+    // The channel range [window_start_x, window_end_x) is walked manually inside the window lambda below.
+    const int window_end_x   = input_dim_c;
+    const int window_start_x = 0;
+
+    Iterator out(dst0, window_out);
+
+    const float32x4_t             half_scale_v = vdupq_n_f32(0.5f);
+    const UniformQuantizationInfo src_qinfo    = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo dst_qinfo    = dst0->info()->quantization_info().uniform();
+    // Ratio of destination to source quantization scale, used when requantizing.
+    const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
+    // "new_offset" doesn't have to consider the "half_scale_v" in its computation
+    // With a requantization performed in a single step there won't be uncertainties introduced
+    const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        // Input coordinates of the pooling window origin for this output element (negative inside the leading padding)
+        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+        // pool_start_*: first window offset that lands on a non-negative input coordinate
+        const int pool_start_x = std::max(0, -in_idx_width);
+        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+        const int pool_start_y = std::max(0, -in_idx_height);
+        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+        const int pool_start_z = std::max(0, -in_idx_depth);
+        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+        // Clip the window end against the input extent so that trailing padding (x, y and z) is never read
+        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+        // Averaging factor that the accumulated sum is multiplied by (computed by calculate_avg_scale_pool3d)
+        const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+                                                       pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+        // Base pointer of the batch selected by id[4]
+        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+        int x_off = window_start_x;
+        // Vector loop: each iteration loads 16 channels (q8x16_t); the paired 8-lane stores assume window_step_x == 16 - TODO confirm at the call site
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+        {
+            q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+            q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+            q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+            q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+
+            // Accumulate every in-bounds element of the pooling region
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const q8x16_t  data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        // Widen in two steps (q8 -> q16 -> q32) so the running sums are held in q32 lanes
+                        const q16x8_t data_q16  = wrapper::vmovl(wrapper::vgetlow(data));
+                        const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
+                        vres1                   = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
+                        vres2                   = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
+                        vres3                   = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
+                        vres4                   = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+                    }
+                }
+            }
+
+            if(src_qinfo != dst_qinfo) // requantize only when input and output quantization differ
+            {
+                const float32x4x4_t vres =
+                {
+                    {
+                        vcvtq_f32_q32(vres1),
+                        vcvtq_f32_q32(vres2),
+                        vcvtq_f32_q32(vres3),
+                        vcvtq_f32_q32(vres4),
+                    }
+                };
+                const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+                // Store the 16 requantized results as two 8-lane halves
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
+            }
+            else
+            {
+                const float32x4_t scale_v = vdupq_n_f32(scale);
+                // Scale the sums and add 0.5f ahead of the float -> integer conversion (vmla presumably computes 0.5f + sum * scale, as the scalar tail does)
+                vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
+                vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
+                vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
+                vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+                // Narrow the q32 results back to q8 in two vmovn steps
+                const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
+                const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
+                // Store the 16 results as two 8-lane halves
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+            }
+        }
+
+        // Scalar tail for the channels left over after the vector loop
+        for(; x_off < window_end_x; ++x_off)
+        {
+            q32_t res = static_cast<q32_t>(0.f);
+
+            // Accumulate every in-bounds element of the pooling region for this single channel
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                        res += data;
+                    }
+                }
+            }
+
+            if(src_qinfo != dst_qinfo)
+            {
+                const float res_f           = static_cast<float>(res);
+                const float new_scale       = quant_rescale / scale; // folds the averaging factor and the rescale into one quantization scale
+                const auto  requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+
+                // Store result
+                *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+            }
+            else
+            {
+                // Multiply by scale and add 0.5f before the truncating cast. NOTE(review): for negative sums (signed T) this is not round-to-nearest - confirm intended
+                res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+
+                // Store result
+                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+            }
+        }
+    },
+    out);
+}
+
+template <typename T>
+void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
+                                    const int window_step_x)
+// Quantized 3D max pooling: takes the per-channel maximum over each pooling region and requantizes it when src/dst quantization differ.
+{
+    using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
+    using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+
+    const int window_half_step_x = window_step_x / 2; // step of the 64-bit vector (q8x8_t) fallback loop below
+    // Pooling geometry (strides, window sizes, leading paddings) read from pool_info.
+    int pool_stride_x = static_cast<int>(pool_info.stride.width);
+    int pool_stride_y = static_cast<int>(pool_info.stride.height);
+    int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+    // Global pooling uses the full input extent along dimensions 1..3 as the window size.
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+    const int pool_pad_top   = static_cast<int>(pool_info.padding.top);
+    const int pool_pad_left  = static_cast<int>(pool_info.padding.left);
+    const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+    // Input extents; the variable names follow the NDHWC layout (dimension 0 = channels).
+    const int input_dim_c = src->info()->dimension(0);
+    const int input_dim_w = src->info()->dimension(1);
+    const int input_dim_h = src->info()->dimension(2);
+    const int input_dim_d = src->info()->dimension(3);
+    // Byte strides of the source tensor for dimensions 1..4.
+    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+    // The channel range [window_start_x, window_end_x) is walked manually inside the window lambda below.
+    const int window_end_x   = input_dim_c;
+    const int window_start_x = 0;
+
+    Iterator out(dst0, window_out);
+
+    const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
+    // Requantization parameters; they are applied only when src and dst quantization info differ.
+    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
+    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        // Input coordinates of the pooling window origin for this output element (negative inside the leading padding)
+        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+        // pool_start_*: first window offset that lands on a non-negative input coordinate
+        const int pool_start_x = std::max(0, -in_idx_width);
+        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+        const int pool_start_y = std::max(0, -in_idx_height);
+        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+        const int pool_start_z = std::max(0, -in_idx_depth);
+        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+        // Clip the window end against the input extent so that trailing padding (x, y and z) is never read
+        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+        // Base pointer of the batch selected by id[4]
+        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+        int x_off = window_start_x;
+        // 128-bit vector loop: loads one q8x16_t per window position; the accumulator starts at numeric_limits<T>::min()
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+        {
+            q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+
+            // Take the lane-wise maximum over every in-bounds element of the pooling region
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const q8x16_t  data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+                        vres = wrapper::vmax(vres, data);
+                    }
+                }
+            }
+
+            // Store the maxima, passing them through vrequantize_pooling first when the quantization info differs
+            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
+                            requant_qinfo) :
+                            vres);
+        }
+
+        // Leftovers handled window_half_step_x channels at a time with 64-bit vectors (q8x8_t)
+        for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
+        {
+            q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
+
+            // Take the lane-wise maximum over every in-bounds element of the pooling region
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const q8x8_t   data     = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+                        vres = wrapper::vmax(vres, data);
+                    }
+                }
+            }
+
+            // Store the maxima, requantized first when the quantization info differs
+            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+                            (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+        }
+
+        // Scalar tail for the channels that remain after both vector loops
+        for(; x_off < window_end_x; ++x_off)
+        {
+            T res = std::numeric_limits<T>::min();
+            // Maximum over every in-bounds element of the pooling region for this single channel
+            for(int z = pool_start_z; z < pool_end_z; ++z)
+            {
+                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                for(int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+                        res = std::max(res, data);
+                    }
+                }
+            }
+
+            // Store the maximum, requantized through quantize<T> when the quantization info differs
+            if(src_qinfo != dst_qinfo)
+            {
+                const float res_f                           = static_cast<float>(res);
+                *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+            }
+            else
+            {
+                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+            }
+        }
+    },
+    out);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
\ No newline at end of file
diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h
index fc73cf0..8a73f8a 100644
--- a/src/cpu/operators/CpuPool3d.h
+++ b/src/cpu/operators/CpuPool3d.h
@@ -47,7 +47,7 @@
     /** Set the src and dst tensors.
      *
      *
-     * @param[in]  src       Source tensor info. Data types supported: F16/F32.
+     * @param[in]  src       Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
      * @param[out] dst       Destination tensor info. Data types supported: same as @p src.
      * @param[in]  pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
      */